* This do-file reads in raw datasets on policy firms from Credit Bureau database and saves clean versions of the data.
* Authors: Ellora Derenoncourt & Monica Essig Aberg

* ------------------------------------------------------------------------------
*1. National monthly wage distributions dataset for policy employers
* ------------------------------------------------------------------------------
	* Read in raw Excel file on company wage distributions, restrict policy employers
	import excel using "$data/cb/raw/[Project Name].xlsx", clear first sheet(result1)
	keep if inlist(cmp,22222,11111,33333,55555,44444,66666)

	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(archive), 1, 4) + "-" + substr(string(archive), 5, 2) + "-01"
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	gen month=month(dofm(mdate))
	gen year=yofd(dofm(mdate))

	* Generate total employment variable 
	egen tot_emp=rowtotal(wage_lt30 wage_30)
	la var tot_emp "Total employment"
	
	* Interpolate employment for non-reporting months identified as outliers in employment series.
	preserve
		collapse (sum) tot_emp, by(mdate cmp_company_code)
		xtset cmp_company_code mdate
		tssmooth ma emp_ma = tot_emp, window(1 1 1)
		gen outlier = tot_emp < emp_ma*.75
		levelsof cmp_company_code, local(comp)

		keep mdate cmp_company_code outlier
		tempfile outliers
		save `outliers', replace
	restore

	merge 1:1 mdate cmp_company_code using `outliers', nogen

	xtset cmp_company_code mdate

	foreach var of varlist wage* tot_emp {
		gen adj_`var' = `var'
		replace adj_`var' = (L.`var' + F.`var')/2 if outlier==1
	}

	* Correct for Company 22222 non-reporting in November and December 2016	
	foreach var of varlist adj* {
		replace `var' = (L2.`var' + F2.`var')/2 if archive==201612 & cmp==22222
	}

	foreach var of varlist adj* {
		replace `var' = (L2.`var' + F2.`var')/2 if archive==201611 & cmp==22222
	}	

	* Correct for Company 66666 non-reporting months in 2014 and 2016		
	egen yearly_mean_emp=mean(tot_emp),by(year)
	preserve
	keep if year==2013
	local mean_emp_2013=yearly_mean_emp
	restore
	preserve
	keep if year==2015
	local mean_emp_2015=yearly_mean_emp
	restore	
	preserve
	keep if year==2017
	local mean_emp_2017=yearly_mean_emp
	restore		
	local emp_growth_yearly1=((`mean_emp_2015'-`mean_emp_2013')/`mean_emp_2013'*2)
	local emp_growth_yearly2=((`mean_emp_2017'-`mean_emp_2015')/`mean_emp_2015'*2)
	foreach var of varlist adj* {
		replace `var' = . if cmp==66666 & (inrange(archive,201402,201410) | inrange(archive,201607,201608))
		replace `var'=`var'[_n-12]*(1+`emp_growth_yearly1') if `var'==. & year==2014
		replace `var'=`var'[_n-12]*(1+`emp_growth_yearly2') if `var'==. & year==2016		
	}	
	drop yearly_mean_emp

	* Keep adjusted variables only
	drop outlier
	rename tot_emp tot_emp_unadj
	drop wage*	
	rename adj_* *
	
	* Generate share below wage bin variables
	forval i=8(1)29{
		gen share_wage_lt`i'=wage_lt`i'/(tot_emp)
		la var share_wage_lt`i' "Share below $`i'"
	}		

	* Generate wage bin variables and wage equal to or greater than variables 
	forval i=8(1)29{
		local n=`i'+1
		gen wage_gt`i'=wage_`i' 
		la var wage_gt`i' "Wage equal to or greater than $`i'"
		replace wage_`i'=wage_lt`n'-wage_lt`i'
	}	
	
	la var tot_emp_unadj "Total employment, unadjusted"
	la var tot_emp "Total employment, adjusted for Dec. non-reporting"

	rename wage_lt8 wage_7
	
	* Generate average wage among workers earning $8-29 
	preserve
	drop wage_30
	drop wage_lt* wage_gt* 
	keep wage_* mdate cmp  
	reshape long wage_, i(mdate cmp) j(dollar)
	rename wage_ n_workers

	egen tot_emp_7_29=sum(n_workers),by(cmp mdate)

	replace n_workers=n_workers/tot_emp_7_29

	gen weighted_wage=n_workers*dollar
	collapse (sum) wage=weighted_wage (mean) tot_emp_7_29, by(cmp mdate)
	gen ln_wage=log(wage)
	keep wage ln_wage cmp mdate
	tempfile avg_wage
	save `avg_wage'
	restore
	 
	merge 1:1 cmp mdate using `avg_wage', keepusing(wage ln_wage) nogen	 
	
	la var wage "Average wage"
	la var ln_wage "Log average wage"
	
	save "$data/cb/clean_policy_firm_national.dta", replace

* ------------------------------------------------------------------------------
*2. Policy employers events dataset
* ------------------------------------------------------------------------------
	use "$data/cb/clean_policy_firm_national.dta", clear

	levelsof cmp, local(levels_cmp)
	
	foreach c of local levels_cmp {
		use "$data/cb/clean_policy_firm_national.dta", clear
		keep archive mdate cmp_company_code share_wage_lt* tot_emp
			
		keep if cmp==`c'
			
		forval i=8(1)28{
							local n=`i'+1
							gen event`n'=0
				} 
				forval i=9(1)14{ // Separate criteria for sub-15 minimum wages
							replace event`i'=(share_wage_lt`i'>=.10 & share_wage_lt`i'[_n+1]<.05 & share_wage_lt`n'[_n+1]>=.10) 

			}
				forval i=15(1)15{ // Separate critera for 15 minimum wage
							replace event`i'=(share_wage_lt`i'>=.20 & share_wage_lt`i'[_n+1]<.10 & share_wage_lt`n'[_n+1]>=.20)

			}	
				forval i=16(1)29{ // Separate criteria for above 15 minimum wages
							replace event`i'=(share_wage_lt`i'>=.10 & share_wage_lt`i'[_n+1]<.05 & share_wage_lt`n'[_n+1]>=.10)

			}	

		* Generate variables characterizing each minimum wage policy
		gen share_affected=.
		la var share_affected "Share of workers affected by firm policy"
		gen missing_mass=. 
		la var missing_mass "Missing mass below minimum wage"
		gen excess_mass0=. 
		la var excess_mass0 "Excess mass at exactly minimum wage"
		gen excess_mass1=. 
		la var excess_mass1 "Excess mass at or upto $1 above minimum wage"
		gen excess_mass2=.
		la var excess_mass2 "Excess mass at or upto $2 above minimum wage"
		gen mw=.

		forval i=9(1)26{
			local n=`i'+1
			local m=`i'+2
			local o=`i'+3
			replace mw = `i' if event`i'==1
			replace share_affected=share_wage_lt`i'[_n-1] if event`i'==1
			replace missing_mass=share_wage_lt`i'[_n+1]-share_wage_lt`i' if event`i'==1
			replace excess_mass2=share_wage_lt`o'[_n+1]-share_wage_lt`i'[_n+1] if event`i'==1
			replace excess_mass1=share_wage_lt`m'[_n+1]-share_wage_lt`i'[_n+1] if event`i'==1
			replace excess_mass0=share_wage_lt`n'[_n+1]-share_wage_lt`i'[_n+1] if event`i'==1		
		}	
		
		* Flag rows containing events
		egen sum_event=rowtotal(event9-event29)
		
		* Keep only event rows
		drop if sum_event==0
		
		* Keep only event-related variables and save a tempfile for each company's events
		keep cmp_company_code archive mdate mw share_affected missing_mass excess_mass* tot_emp
		tempfile event_`c'
		save `event_`c''
		use `event_`c'', clear
		}
	clear
	
	* Append together all company events
	foreach c of local levels_cmp {
		append using `event_`c''
		}
	
	* Remove duplicate events (just one, for Company Code 33333)
	quietly bysort cmp mw:  gen dup = cond(_N==1,0,_n)
	drop if dup>1 & mdate> mdate[_n-1] & cmp==cmp[_n-1]
	
	drop dup 

	* Create an event identifier
	gen eventid=_n
	
	* Calculate absolute value of missing mass
	replace missing_mass=abs(missing_mass)
	
	* Calculate number of months since last policy
	gen months_since_last_policy=mdate-mdate[_n-1] if cmp==cmp[_n-1]
	
	* Calculate number of months until next policy
	gen months_until_next_policy=mdate[_n+1]-mdate if cmp==cmp[_n+1]
	
	* Replace with number of months since start of the dataset if first policy
	replace months_since_last_policy=mdate-tm(2013m1) if months_since_last_policy==. & cmp!=55555
	replace months_since_last_policy=mdate-tm(2013m9) if months_since_last_policy==. & cmp==55555 // Company 55555's earliest archive is 201309
	
	* Replace with number of months until end of the dataset if last policy
	replace months_until_next_policy=tm(2023m08)-mdate if months_until_next_policy==.
		
	* Save data in csv and dta format
	outsheet using "$data/cb/events.csv", comma replace	
	save "$data/cb/events.dta", replace	

* ------------------------------------------------------------------------------
*3. CZ-level flows from policy to non-policy
* ------------------------------------------------------------------------------
	* Merging in NAICS info
	* Get 3-digit NAICS codes and names
	import excel using "$data/cb/raw/[Project Name].xlsx", sheet("Avg. Hourly Pay NAICS") cellrange(A4:B92) clear firstrow
	drop if er_naics==.
	rename NAICSD er_naics_name
	tempfile industry_names
	save `industry_names'	
	
	* Get NAICS codes 

	insheet using  "$data/cb/raw/[Project Name]_result1_8_ernaics.csv", clear
	contract cmp_company_code er_naics
	quietly bysort cmp_company_code:  gen dup = cond(_N==1,0,_n)
	tab dup
	drop if dup>1
	tempfile cmp_naics
	save `cmp_naics'
	
	foreach yearset in "2013_2016" "2017_2019" "2020_2021" "2022_2023" {
	
		insheet using "$data/cb/raw/[Project Name]_result1_7_wage_`yearset'.csv", clear
		
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(wt_), 1, 4) + "-" + substr(string(wt_), 5, 2) + "-01"
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))

		rename czone cz
		gen wage=wage_flag
		replace wage="wage_7" if wage_flag=="wage_lt8"
		replace wage="wage_30" if wage_flag=="wage_gt30"
		replace wage="" if wage_flag=="other"
		replace wage=substr(wage, 6,length(wage))		
		destring wage, replace			
		tempfile `yearset'
		save ``yearset''		
	}
	
	clear
	
	foreach yearset in "2013_2016" "2017_2019" "2020_2021" "2022_2023" {
		append using ``yearset''
	}	
	
	* Merge in data on company industry
	rename cmp_company_code_new cmp_company_code
	merge m:1 cmp_company_code using `cmp_naics' , keepusing(er_naics) 
	drop if _merge==2
	drop _merge
	
	* Add 3-digit NAICS names
	merge m:1 er_naics using `industry_names', keepusing(er_naics_name)
	drop if _merge==2
	drop _merge

	gen sample_ind= inlist(er_naics,561,452,722,445,448,492,454,444,451)

	save "$data/cb/raw_policy_to_nonpolicy_new_hires.dta", replace
	
* ------------------------------------------------------------------------------
*4. CZ-level new hires for policy employers
* ------------------------------------------------------------------------------
	* Load new hires from non-policy companies
	insheet using "$data/cb/raw/[Project Name]_result1_7_czone_nonpol_pol_hire_date.csv", clear
	
	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(wt_), 1, 4) + "-" + substr(string(wt_), 5, 2) + "-01"
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	gen month=month(dofm(mdate))
	gen year=yofd(dofm(mdate))
	
	collapse (sum) new_hires_nonpolicy = freq,by(mdate cmp*new czone)	

	rename cmp_company_code_new cmp_company_code
	drop if cmp_company_code=="other"
	destring cmp_company_code, replace
	rename czone cz	

	gen ln_new_hires_nonpolicy=log(new_hires_nonpolicy)
	
	la var new_hires_nonpolicy "New hires from non-policy firms"
	la var ln_new_hires_nonpolicy "Log new hires from non-policy firms"
	
	tempfile new_hires_nonpolicy
	save `new_hires_nonpolicy', replace
	
	* Load new hires data
	foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
		insheet using "$data/cb/raw/[Project Name]_result1_3_czone_allcompany_`dateset'.csv", clear
			keep if inlist(cmp_company_code,22222, 44444, 55555, 66666, 33333, 11111 )
		tempfile newhires_`dateset'
		save `newhires_`dateset'', replace
	}
	clear
	foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
		append using `newhires_`dateset''
	}
	
	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(wt_original), 1, 4) + "-" + substr(string(wt_original), 5, 2)
	drop if strlen(string(wt_original)) == 5 // duplicates
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	gen month=month(dofm(mdate))
	
	gen year=yofd(dofm(mdate))	
	
	* Rename CZ variable
	rename czone cz
		
	* Generate total new hires variable
	egen tot_new_hires=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29 wage_gt30)		
	la var tot_new_hires "Total new hires"
	egen tot_new_hires_7_29=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29)		
	la var tot_new_hires_7_29 "Total new hires with wage < $30"
	drop wage* 
	
	* Generate log new hires variables
	foreach var of varlist tot_new_hires* {
		gen ln_`var'=log(`var')
	}
	
	* Generate year over year difference
	egen id = group(cmp_company_code cz)
	xtset id mdate
	foreach var of varlist tot_new_hires* ln_tot* {
		gen yoy_`var' = `var'-L12.`var'
	}
	
	* Clean
	keep cmp_company_code cz mdate month year tot_new_hires* ln_tot* yoy_* 

	* Save
	tempfile new_hires_full
	save `new_hires_full', replace
	
	merge 1:1 cmp_company_code mdate cz using `new_hires_nonpolicy', keepusing(ln_new_hires_nonpolicy new_hires_nonpolicy)
	replace new_hires_nonpolicy=0 if new_hires_nonpolicy==.
	
	* Labels
	label variable cz "Commuting zone"
	label variable cmp "Company code"
	label variable mdate "Month"
	label variable month "Month (1-12)"
	label variable year "Year"
	label variable tot_new_hires "Total new hires"
	label variable tot_new_hires_7 "Total new hires with wage < $30 "
	foreach var of varlist tot* {
		local name: variable label `var'
		label variable ln_`var' "Log of `name'"
	}
	foreach var of varlist tot* ln_tot* {
		local name: variable label `var'
		label variable yoy_`var' "YoY diff. of `name'"
	}
	
	* Save clean cz-month-company data
	save "$data/cb/clean_policy_new_hires_cz.dta", replace

* ------------------------------------------------------------------------------
*5. CZ-level separations for policy employers
* ------------------------------------------------------------------------------
	* Load separations data

	use "$data/cb/raw_policy_to_nonpolicy_new_hires.dta", clear
	collapse (sum) freq, by(cmp_company_code_previous mdate cz)
	drop if cmp_company_code_previous=="other"
	destring cmp_company_code_previous, gen(cmp_company_code)
	drop *previous
	
	rename freq sep_to_nonpolicy
	gen ln_sep_to_nonpolicy=log(sep_to_nonpolicy)
	
	la var sep_to_nonpolicy "Separations to non-policy"
	la var ln_sep_to_nonpolicy "Log separations to non-policy"
	tempfile sep_to_nonpolicy
	save `sep_to_nonpolicy'

	* Load full separations data
	foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
		insheet using "$data/cb/raw/result1_2_czone_allcompany_v2_`dateset'.csv", clear
		keep if inlist(cmp_company_code,22222, 44444, 55555, 66666, 33333, 11111 )
		tempfile separations_`dateset'
		save `separations_`dateset'', replace
	}
	clear
	foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
		append using `separations_`dateset''
	}

	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(date_of_termination), 1, 4) + "-" + substr(string(date_of_termination), 5, 2)
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	*replace mdate=mdate+1
	gen month=month(dofm(mdate))
	gen year=yofd(dofm(mdate))	
	
	* Rename CZ variable
	rename czone cz
		
	* Generate total new hires variable
	egen tot_separations=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29 wage_gt30)		
	la var tot_separations "Total separations"
	egen tot_separations_7_29=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29)		
	la var tot_separations_7_29 "Total separations with wage < $30"
	egen tot_separations_7_15=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15)		
	la var tot_separations_7_15 "Total separations with wage < $16"	
	
	drop wage* 
	
	* Replace
	egen id = group(cmp_company_code cz)
	xtset id mdate
	foreach var of varlist tot_separations* {
		gen adj_`var' = `var'
	}

	foreach var of varlist tot_separations* {
		replace adj_`var' = L2.`var' if date_of_termination>=202307
	}		

	* Keep adjusted variables only, except total employment
	rename tot_separations tot_separations_unadj
	drop tot_separations_7_29 tot_separations_7_15	
	rename adj_* *
	
	* Generate log new hires variables
	foreach var of varlist tot_separations* {
		gen ln_`var'=log(`var')
	}

	* Generate year over year difference
	xtset id mdate
	foreach var of varlist tot_separations* ln* {
		gen yoy_`var' = `var'-L12.`var'
	}
	
	* Clean
	keep cmp_company_code cz mdate month year tot_separations* yoy* ln_* 

	* Labels
	label variable cz "Commuting zone"
	label variable cmp "Company code"
	label variable mdate "Month"
	label variable month "Month (1-12)"
	label variable year "Year"
	label variable tot_separations_unadj "Total separations (unadjusted)"
	label variable tot_separations "Total separations (adjusted)"
	label variable tot_separations_7_29 "Total separations with wage < $30 (adjusted)"
	label variable tot_separations_7_15 "Total separations with wage < $16 (adjusted)"
	foreach var of varlist tot* {
		local name: variable label `var'
		label variable ln_`var' "Log of `name'"
	}

	foreach var of varlist tot* ln* {
		local name: variable label `var'
		label variable yoy_`var' "YoY diff. of `name'"
	}
	quietly bysort cmp cz mdate: gen dup = cond(_N==1,0,_n)
	tab dup
		
	merge 1:1 cmp_company_code cz mdate using `sep_to_nonpolicy', keepusing(sep_to_nonpolicy ln_sep_to_nonpolicy) 
	replace sep_to_nonpolicy=0 if sep_to_nonpolicy==.
		
	* Save clean cz-month-company data
	save "$data/cb/clean_policy_separations_cz.dta", replace
	
* ------------------------------------------------------------------------------
*6. CZ-level monthly wage distributions dataset for policy employers
* ------------------------------------------------------------------------------	
	foreach company in "11111" "66666" "55555" "44444" "22222" "33333" {
		
		* Load data
		if "`company'"=="22222"{
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part1.csv", clear 
			tempfile part1
			save `part1'
			
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part2.csv", clear 
			append using `part1'		
		}
		else {
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_`company'.csv", clear 
		}

		* ID wage bins
		gen wage_bin=subinstr(wage_flag,"wage_","",.)
		replace wage_bin="30" if wage_bin=="gt30"
		replace wage_bin="7" if wage_bin=="lt8"
		replace wage_bin="" if wage_bin=="null"
		destring wage_bin, replace
		drop if wage_bin==.
		drop wage_flag
		
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(archive), 1, 4) + "-" + substr(string(archive), 5, 2) + "-01"
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))
		la var year "Year"
		drop str_monthyear
		
		* Calculate total employment
		gen n_emp = freq
		bysort cz mdate: egen tot_emp = sum(n_emp)

		* Calculate total retention
		gen n_retain = freq_previousjoin
		bysort cz mdate: egen tot_retain = sum(n_retain)
	
		* Drop highest wage bin
		drop if wage_bin == 30
		
		* Generate averages w/in wage bin --------------------------------------
		
		* Average hourly wage rate using mean wage in bins
		gen avg_wage_exact=total_gross_pay_rate/freq_gross_pay_rate
		gen flag_avg_wage_exact=0 
		replace flag_avg_wage_exact=1 if avg_wage_exact>=wage_bin+1
		replace flag_avg_wage_exact=1 if avg_wage_exact<wage_bin
		gen gap=avg_wage_exact-wage_bin
		replace flag_avg_wage_exact=0 if abs(gap)<.01
		
		* Average hourly wage rate using lower bound of wage bins
		gen avg_wage_bins = wage_bin
		
		* Average monthly base pay
		gen avg_monthly_pay_ytd=total_gross_pay_ytd/month_worked_ytd
		gen avg_monthly_pay=currentmonthearn/freq_currentmonthearn // CHANGED NAME TO _curr
		
		* Average monthly hours (binsd)
		gen avg_monthly_hours_ytd=avg_monthly_pay_ytd/avg_wage_exact
		gen avg_monthly_hours=avg_monthly_pay/avg_wage_exact
		
		* Average weekly hours (binsd)
		gen avg_weekly_hours_ytd=avg_monthly_hours_ytd/4.29
		gen avg_weekly_hours=avg_monthly_hours/4.29
		
		* Average gross pay by category
		foreach type in "overtime" "bonus" "commis" "other" "total" {
			gen avg_grs_`type' = total_ep_gross_`type'/month_worked_ep_gross_`type'
		}
		
		* Generate log of wage bin averages. Assumes all workers in bin earn the exact same amount
		foreach var in avg_wage_exact avg_wage_bins avg_monthly_pay_ytd avg_monthly_pay avg_monthly_hours_ytd avg_monthly_hours avg_weekly_hours_ytd avg_weekly_hours avg_grs_overtime avg_grs_bonus avg_grs_commis avg_grs_other avg_grs_total{
			gen ln_`var'=log(`var')
		}
		
		* Generate total counts within cz-month --------------------------------
		
		// NOTE: freq_gross_pay_rate ALMOST EQUIVALENT TO freq, BUT 2,000 CELLS NOT EXACTLY
		* Number emp with reported wage < 30
		gen n_emp_7_29 = freq
		bysort cz mdate: egen tot_emp_7_29 = sum(n_emp_7_29)
		
		* Calculate total retention with reported wage <30
		gen n_retain_7_29 = freq_previousjoin
		bysort cz mdate: egen tot_retain_7_29 = sum(n_retain_7_29)
		
		* Number emp with ytd gross pay reported, wage < 30
		egen tot_freq_gross_pay_ytd = sum(freq_gross_pay_ytd), by(archive cz)
		
		* Number emp with matched monthly pay, wage < 30
		egen tot_freq_currentmonthearn = sum(freq_currentmonthearn), by(archive cz)

		* Generate weighted avgs within cz-month -------------------------------

		foreach var of varlist avg_wage* avg_monthly* avg_weekly* avg_grs* ln_* {
			gen wt_`var'=`var'*n_emp
			egen wt_sum_`var'=sum(wt_`var'), by(archive cz)
			gen wt_avg_`var'=wt_sum_`var'/tot_emp_7_29
		}
		
		* Reshape wide for counts in each wage bin -----------------------------
		
		preserve
			keep cz mdate n_emp n_retain wage_bin
			reshape wide n_emp n_retain, i(cz mdate) j(wage_bin)
			foreach var of varlist n_emp* n_retain* {
				replace `var' = 0 if `var' == .
			}
			tempfile emp
			save `emp', replace
		restore
		
		* Collapse to cz-month -------------------------------------------------
		
		collapse (mean) wt_avg_ln* wt_avg_avg* tot_emp* tot_retain* tot_freq_gross_pay_ytd tot_freq_currentmonthearn, by(mdate year month cz)
		
		rename wt_avg_avg* avg* 
		rename wt_avg_ln* avg_ln*
		
		* Merge in counts in each wage bin
		merge 1:1 cz mdate using `emp', nogen
		
		* Correct for low reporting/other anomolies ----------------------------
		
		* Flag low-reporting months for ytd variables
		preserve
			collapse (sum) tot_freq_gross_pay_ytd, by(mdate)
			tsset mdate
			tssmooth ma tot_freq_gross_pay_ytd_ma = tot_freq_gross_pay_ytd, window(1 0 1)
			gen outlier_ytd = tot_freq_gross_pay_ytd < tot_freq_gross_pay_ytd_ma*.75
			*graph tw (line tot_freq_gross_pay_ytd mdate) (scatter tot_freq_gross_pay_ytd mdate if outlier_ytd == 1)
			*graph export "$figures_tables/cb/quality_checks/tot_freq_gross_pay_ytd_`company'_ts.png", replace
			tempfile outliers_ytd
			save `outliers_ytd', replace
		restore

		* Flag low-reporting months for pay rate variables
		preserve
			collapse (sum) tot_emp, by(mdate)
			tsset mdate
			tssmooth ma tot_emp_ma = tot_emp, window(1 0 1)
			gen outlier_rate = tot_emp < tot_emp_ma*.75
			sum outlier_rate
			*graph tw (line tot_emp mdate) (scatter tot_emp mdate if outlier_rate == 1)
			*graph export "$figures_tables/cb/quality_checks/tot_emp_`company'_ts.png", replace
			tempfile outliers_rate
			save `outliers_rate', replace
		restore

		* Merge
		merge m:1 mdate using `outliers_ytd', nogen
		merge m:1 mdate using `outliers_rate', nogen
		drop *_ma
		
		* Manually flag additional outliers
		if "`company'" == "11111" {
			replace outlier_ytd = 1 if mdate == tm(2013m1) | mdate == tm(2014m1) | mdate == tm(2017m4) | mdate == tm(2021m2) | mdate == tm(2022m1)
		}
		if "`company'" == "66666" {
			replace outlier_ytd = 1 if mdate < tm(2015m12) | inrange(mdate, tm(2023m1), tm(2023m3))
			replace outlier_rate = 1 if inrange(mdate,tm(2014m2),tm(2014m10)) | mdate == tm(2023m3)
		}
		if "`company'" == "55555" {
			replace outlier_ytd = 1 if mdate == tm(2014m1) | mdate == tm(2018m6) | mdate == tm(2018m7) | mdate == tm(2019m1) | mdate == tm(2020m1)
		}
		if "`company'" == "44444" {
			replace outlier_ytd = 1 if (month == 1 & (year == 2016 | year == 2017 | year == 2021)) | mdate == tm(2023m3) | mdate == tm(2023m2)
		}
		if "`company'" == "22222" {
			replace outlier_rate = 1 if mdate == tm(2016m11) | mdate == tm(2016m12) | mdate == tm(2017m9)
			replace outlier_ytd = 1 if (month == 1 | month == 2 | month == 12) & mdate < tm(2014m12) // Really not sure about this
		}
		if "`company'" == "33333" {
			replace outlier_ytd = 1 if mdate == 636 | mdate == tm(2014m1)
		}
			
		* Make adjustments
		xtset cz mdate
		sort cz mdate
		* YTD variables
		foreach var of varlist tot_freq_gross_pay_ytd avg_monthly_pay* avg_monthly_hours* avg_weekly_hours* avg_grs* avg_ln*_monthly_pay* avg_ln*_monthly_hours* avg_ln*_weekly_hours* avg_ln*_grs*{
			rename `var' `var'_u
			gen `var' = `var'_u
			replace `var' = (L.`var' + F.`var')/2 if outlier_ytd == 1
			if "`company'" == "66666" {
				replace `var' = (`var'[_n+648-mdate]+`var'[_n+658-mdate])/2 if inrange(mdate, 649, 657)
				replace `var' = (`var'[_n+677-mdate]+`var'[_n+680-mdate])/2 if inrange(mdate, 678, 679)
				replace `var' = (`var'[_n+755-mdate]+`var'[_n+759-mdate])/2 if inrange(mdate, 756, 758)
				replace `var' = . if mdate < tm(2015m12)
			}
			if "`company'" == "55555" {
				replace `var' = (`var'[_n+700-mdate]+`var'[_n+703-mdate])/2 if inrange(mdate, 701, 702)
			}
			if "`company'" == "44444" {
				replace `var' = (`var'[_n+756-mdate]+`var'[_n+759-mdate])/2 if inrange(mdate, 757, 758)
			}
			if "`company'" == "22222" {
				replace `var' = (`var'[_n-1]+`var'[_n+3])/2 if (month == 12) & (year == 2014)
				replace `var' = (`var'[_n-2]+`var'[_n+2])/2 if (month == 1) & (year == 2014)
				replace `var' = (`var'[_n-3]+`var'[_n+1])/2 if (month == 2) & (year == 2014)
				replace `var' = `var'[_n+2] if (month == 1) & (year == 2013)
				replace `var' = `var'[_n+1] if (month == 2) & (year == 2013)
			}
		}
		
		* Wage rate variables
		foreach var of varlist tot_emp* n_emp* avg_wage* avg_ln_avg_wage* {
			rename `var' `var'_unadj
			gen `var' = `var'_unadj
			replace `var' = (L.`var' + F.`var')/2 if outlier_rate == 1
			if "`company'" == "66666" {
				replace `var' = (`var'[_n+648-mdate]+`var'[_n+658-mdate])/2 if inrange(mdate, 649, 657)
				replace `var' = (`var'[_n+677-mdate]+`var'[_n+680-mdate])/2 if inrange(mdate, 678, 679)
			}
			if "`company'" == "22222" {
				replace `var' = (`var'[_n+681-mdate]+`var'[_n+684-mdate])/2 if inrange(mdate, 682, 683)
			}
		}

		* Retention Shares
		foreach type in "" "_7_29" {
		
			* Retention as share of current emp
			gen retain_curr`type' = tot_retain`type' / tot_emp`type'_unadj	
			
		}
		
		* Drop retention in first month of 55555
		if "`company'" == "55555" {
			replace retain_curr = . if mdate == tm(2013m9)
		}
		
		* Retention (impute from last year with level shifter)
		sort cz mdate
		
		*tsfill
		
		* Identify outliers
		gen outlier_retain = outlier_rate == 1 | missing(retain_curr) | mdate == tm(2022m6) | mdate == tm(2022m7)
		replace outlier_retain = 1 if L.outlier_rate == 1
		if "`company'" == "66666" {
			replace outlier_retain = 1 if inrange(mdate, 649, 658)
			replace outlier_retain = 1 if inrange(mdate, 678, 680)
		}
		if "`company'" == "22222" {
			replace outlier_retain = 1 if inrange(mdate, 682, 684)
		}
		
		bysort mdate: egen outlier_2 = max(outlier_retain)
		replace outlier_retain = outlier_2
		drop outlier_2
		
		* Indicate if should be used in comparison value for following year
		sort cz mdate
		gen compare = F12.outlier_retain == 0
		
		foreach var of varlist retain_curr* n_retain* {
			
			rename `var' `var'_unadj
			gen `var' = `var'_unadj
			
			foreach year of numlist 2014(1)2023 {
				sum outlier_retain if year == `year'
				if `r(mean)' > 0 {
					
					* Mean for last year
					local i = `year' - 1
					preserve
						keep if year == `i' & compare == 1
						keep cz `var'
						collapse (mean) `var', by(cz)
						rename `var' yr_comp_`var'
						tempfile comparison
						save `comparison', replace
					restore
					
					* Mean for this year
					preserve
						keep if year == `year' & outlier_retain == 0
						keep cz `var'
						collapse (mean) `var', by(cz)
						rename `var' yr_`var'
						tempfile current
						save `current', replace
					restore
					
					merge m:1 cz using `comparison', nogen
					merge m:1 cz using `current', nogen
					
					* Adjust
					xtset cz mdate
					replace `var' = L12.`var'*(yr_`var'/yr_comp_`var') if outlier_retain == 1 & year == `year' & !missing(L12.`var') & !missing(yr_`var') & !missing(yr_comp_`var')
					replace `var' = yr_`var' if outlier_retain == 1 & year == `year' & (missing(L12.`var') | missing(yr_comp_`var'))
					replace `var' = yr_comp_`var' if outlier_retain == 1 & year == `year' & (missing(yr_`var'))
					
					drop yr_`var' yr_comp_`var'
				}
				
			}
		}
		
		* Impute number retained
		rename tot_retain tot_retain_unadj
		gen tot_retain = tot_retain_unadj
		replace tot_retain = tot_emp*retain_curr if outlier_retain == 1
		rename tot_retain_7_29 tot_retain_7_29_unadj
		gen tot_retain_7_29 = tot_retain_7_29_unadj
		replace tot_retain_7_29 = tot_emp_7_29*retain_curr_7_29 if outlier_retain == 1
		
		* Generate retention rate out of last month's employment
		gen retain_last=tot_retain/L.tot_emp
		gen retain_last_7_29=tot_retain_7_29/L.tot_emp_7_29

		* Generate retention rate from next month's retain out of current month's employment
		gen retain_next=F.tot_retain/tot_emp
		gen retain_next_7_29=F.tot_retain_7_29/tot_emp_7_29
		
		* Differences 
		foreach type in "" "_7_29" {
			* Tot emp minus retention
			gen tot_new_hire_ub`type' = tot_emp`type' - tot_retain`type'
			
			* Tot emp last month minus retention
			gen tot_sep_ub`type' = tot_emp`type' - L.tot_retain`type'
		}
		
		* YOY differences ------------------------------------------------------
		
		* Create YOY differences for YTD variables
		foreach var of varlist avg_monthly_pay avg_monthly_hours avg_weekly_hours {
			gen d_`var' = `var' - L12.`var'
			gen d_`var'_u = `var'_u - L12.`var'_u
		}
		
		* Save
		gen cmp_company_code = `company'
		tempfile clean_monthly_`company'
		save `clean_monthly_`company''

}
	* Append
	clear
	foreach company in "11111" "66666" "55555" "44444" "22222" "33333"{
		append using `clean_monthly_`company''
	}	
	
	* Clean --------------------------------------------------------------------
	
	* Merge in CZ characteristics
	rename czone cz
	merge m:1 cz using "$data/opportunity_insights/raw/onlinedata8.dta", keepusing(pop2000 czname stateabbrv) keep (3) nogen // 6 CZs from the OI data are not present in the firm data
	gen czname_long=czname+ ", " + stateabbrv

	* Merge in state MW information
	preserve
	use "$data/minimum_wages/raw/mw_state_stata/mw_state_monthly.dta", clear
	gen year=yofd(dofm(monthly_date)) // Create year variable
	sum monthly_date
	keep if monthly_date==r(max)
	keep if mean_mw==7.25
	keep stateabb
	rename stateabb stateabbrv

	tempfile states_no_mw
	save `states_no_mw'
	restore
	
	merge m:1 stateabbrv using `states_no_mw'
	gen no_state_mw=(_merge==3)
	drop _merge
		
	* Generate log vars
	foreach var of varlist avg_wage* tot_* avg_monthly_pay* avg_monthly_h* avg_grs_total* {
		gen ln_`var' = log(`var')
	}
	
	* Merge in new hires data
	merge 1:1 cmp_company_code mdate cz using "$data/cb/clean_policy_new_hires_cz.dta", keepusing(tot_new_hires tot_new_hires_7_29 ln_tot_new_hires ln_tot_new_hires_7_29 yoy_tot_new_hires yoy_tot_new_hires_7_29 yoy_ln_tot_new_hires yoy_ln_tot_new_hires_7_29 *new_hires_nonpolicy) 
	foreach var of varlist tot_new_hires tot_new_hires_7_29 new_hires_nonpolicy{
		replace `var'=0 if _merge==1
		replace ln_`var'=log(`var')
	}
	drop if _merge==2
	drop _merge
	
	* Merge in separations data
	merge 1:1 cmp_company_code mdate cz using "$data/cb/clean_policy_separations_cz.dta", keepusing(tot_separations sep_to_nonpolicy yoy* ln*) 
	drop if _merge==2
	replace tot_separations=0 if _merge==1 & mdate!=636
	replace sep_to_nonpolicy=0 if _merge==1 
	drop _merge	
	
	* Generate separation rate
	gen yoy_sep_rate=yoy_tot_separations/tot_emp
	
	* Set as panel data
	egen id = group(cmp_company_code cz)
	xtset id mdate

	*gen sep_rate=tot_separations/tot_emp
	gen sep_rate=1-retain_last
	gen sep_rate_7_29=1-retain_last_7_29
	gen sep_to_nonpolicy_rate=sep_to_nonpolicy/tot_emp
	gen sep_rate_next=1-retain_next
	gen sep_rate_next_7_29=1-retain_next_7_29
	
	* Greater than and less than bins
	* Start with lowest bin
	rename n_emp7 wage_7
	rename n_emp7_unadj wage_7_unadj
	gen wage_gt7 = tot_emp
	gen wage_gt7_unadj = tot_emp_unadj
	gen wage_lt7 = 0
	gen wage_lt7_unadj = 0
	gen share_wage_7 = wage_7/tot_emp
	gen share_wage_7_unadj = wage_7_unadj/tot_emp_unadj

	* Do the remaining bins
	foreach num of numlist 8/29 {
		
		* Rename exact bins
		rename n_emp`num' wage_`num'
		rename n_emp`num'_unadj wage_`num'_unadj

		* Create less than and greater than bins (adjusted)
		local prev = `num' - 1
		gen wage_lt`num' = wage_lt`prev' + wage_`prev'
		gen wage_gt`num' = tot_emp - wage_lt`num'
		
		* Create less than and greater than bins (unadjusted)
		local prev = `num' - 1
		gen wage_lt`num'_unadj = wage_lt`prev'_unadj + wage_`prev'_unadj
		gen wage_gt`num'_unadj = tot_emp_unadj - wage_lt`num'_unadj
		
		* Create shares
		foreach var of varlist wage_`num' wage_lt`num' wage_gt`num' {
			gen share_`var' = `var'/tot_emp
			gen share_`var'_unadj = `var'_unadj/tot_emp_unadj
		}
		
	}
	drop wage_lt7* wage_gt7*
	
	
	* Greater than and less than bins
	* Start with lowest bin
	rename n_retain7 retain_7
	rename n_retain7_unadj retain_7_unadj
	gen retain_gt7 = tot_retain
	gen retain_gt7_unadj = tot_retain_unadj
	gen retain_lt7 = 0
	gen retain_lt7_unadj = 0
	gen share_retain_7 = retain_7/tot_emp
	gen share_retain_7_unadj = retain_7_unadj/tot_emp_unadj

	* Do the remaining bins
	foreach num of numlist 8/29 {
		
		* Rename exact bins
		rename n_retain`num' retain_`num'
		rename n_retain`num'_unadj retain_`num'_unadj

		* Create less than and greater than bins (adjusted)
		local prev = `num' - 1
		gen retain_lt`num' = retain_lt`prev' + retain_`prev'
		gen retain_gt`num' = tot_retain - retain_lt`num'
		
		* Create less than and greater than bins (unadjusted)
		local prev = `num' - 1
		gen retain_lt`num'_unadj = retain_lt`prev'_unadj + retain_`prev'_unadj
		gen retain_gt`num'_unadj = tot_retain_unadj - retain_lt`num'_unadj
		
		* Create retention rates
		foreach var in _ _lt _gt {
			gen share_retain`var'`num' = retain`var'`num'/wage`var'`num'
			gen share_retain`var'`num'_unadj = retain`var'`num'_unadj/wage`var'`num'_unadj
		}
		
	}
		
	* Label vars
	label variable cz "commuting zone"
	label variable mdate "month"
	label variable month "month (1-12)"
	label variable year "year"
	label variable outlier_ytd "year-to-date variables adjusted"
	label variable outlier_rate "pay rate variables adjusted"
	label variable outlier_retain "separation rate variables adjusted"
	label variable tot_freq_gross_pay_ytd "# YTD observations"
	label variable cmp_company_code "company code"
	label variable czname "commuting zone name"
	label variable stateabbrv "state"
	label variable pop2000 "CZ population in 2000"
	label variable czname_long "commuting zone, state"
	
	label variable avg_monthly_pay "current average monthly pay (low # obs)"
	label variable avg_monthly_hours "current average monthly hours (low # obs)"
	label variable avg_weekly_hours "current average weekly hours (low # obs)"
	label variable avg_monthly_pay_ytd "average monthly pay YTD"
	label variable avg_monthly_hours_ytd "average monthly hours YTD"
	label variable avg_weekly_hours_ytd "average weekly hours YTD"
	label variable avg_grs_overtime "average gross overtime pay YTD"
	label variable avg_grs_bonus "average gross bonus pay YTD"
	label variable avg_grs_commis "average gross commission pay YTD"
	label variable avg_grs_other "average gross other income YTD"
	label variable avg_grs_total "average gross total pay YTD"
	label variable avg_wage_exact "average hourly wage"
	label variable avg_wage_bins "average hourly wage bin"

	label variable tot_emp "total employment"
	label variable tot_emp_7_29 "total employment <$30/hr"
	label variable tot_retain "total retained employees"
	label variable tot_retain_7_29 "total retained employees <$30/hr"
	label variable tot_new_hire_ub "total employment minus retained employees"
	label variable tot_new_hire_ub_7_29 "total employment minus retained employees making less than $30/hr"
	label variable tot_sep_ub "total employment last month minus retained employees"
	label variable tot_sep_ub_7_29 "total employment last month minus retained employees making less than $30/hr"
	label variable retain_curr "retention rate"
	label variable retain_curr_7_29 "retention rate, <$30/hr"
	label variable retain_last "retention rate (out of last month's emp.)"
	label variable retain_last_7_29 "retention rate (out of last month's emp.), <$30/hr"
	label variable sep_rate "Separation rate"
	label variable sep_to_nonpolicy_rate "Rate of separations to non-policy firms"

	label variable wage_7 "# with wage <$8"
	label variable share_wage_7 "share with wage <$8"
	global wagevars "wage_7 share_wage_7"
	foreach num of numlist 8/29 {
		local plus = `num' + 1
		label variable wage_`num' "# with wage at least $`num' and below $`plus'"
		label variable share_wage_`num' "share with wage at least $`num' and below $`plus'"
		label variable wage_lt`num' "# with wage below $`num'"
		label variable share_wage_lt`num' "share with wage below $`num'"
		label variable wage_gt`num' "# with wage above $`num'"
		label variable share_wage_gt`num' "share with wage above $`num'"
		global wagevars "$wagevars wage_`num' share_wage_`num' wage_lt`num' share_wage_lt`num' wage_gt`num' share_wage_gt`num'"
	}
	
	foreach var of varlist avg_monthly_pay avg_monthly_pay_ytd avg_monthly_hours avg_monthly_hours_ytd avg_weekly_hours_ytd avg_grs_overtime avg_grs_bonus avg_grs_commis avg_grs_other avg_grs_total tot_emp avg_wage_exact avg_wage_bins $wagevars {
		local name: variable label `var'
		label variable `var' "`name' (adjusted)"
		label variable `var'_u "`name' (unadjusted)"
	}
	
	foreach var of varlist avg_wage* tot_* avg_monthly_pay* avg_monthly_h* avg_grs_total* {
		local name: variable label `var'
		label variable ln_`var' "Log `name'"
	}
	
	foreach var of varlist d_* {
		local original = substr("`var'", 3, .)
		local name: variable label `original'
		label variable `var' "YOY diff. in `name'"
	}
		
	* Save
	save "$data/cb/clean_policy_firm_cz.dta", replace
	
* ------------------------------------------------------------------------------
*6b. Create adjusted retain dataset for missings analysis
* ------------------------------------------------------------------------------	
	foreach company in "11111" "55555" "44444" "22222" "33333" {
		
		* Load data
		if "`company'"=="22222"{
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part1.csv", clear 
			tempfile part1
			save `part1'
			
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part2.csv", clear 
			append using `part1'		
		}
		else {
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_`company'.csv", clear 
		}
		
		* ID wage bins
		gen wage_bin=subinstr(wage_flag,"wage_","",.)
		replace wage_bin="30" if wage_bin=="gt30"
		replace wage_bin="7" if wage_bin=="lt8"
		replace wage_bin="" if wage_bin=="null"
		destring wage_bin, replace
		drop if wage_bin==.
		drop wage_flag
		
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(archive), 1, 4) + "-" + substr(string(archive), 5, 2) + "-01"
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))
		la var year "Year"
		drop str_monthyear

		* Calculate total retention
		gen n_retain = freq_previousjoin
		bysort cz mdate: egen tot_retain = sum(n_retain)
	
		* Drop highest wage bin
		drop if wage_bin == 30

		* Generate total counts within cz-month --------------------------------
		
		* Calculate total retention with reported wage <30
		gen n_retain_7_29 = freq_previousjoin
		bysort cz mdate: egen tot_retain_7_29 = sum(n_retain_7_29)

		* Collapse to cz-month -------------------------------------------------
		
		collapse (mean) tot_retain*, by(mdate year month cz)
		
		* Adjust retention ----------------------------------------------------
		
		xtset cz mdate
		tsfill
		
		gen outlier_retain = 0
		gen tot_retain_unadj = tot_retain
		gen tot_retain_7_29_unadj = tot_retain_7_29
		if "`company'" == "33333" {
			replace outlier_retain = 1 if inrange(mdate, 683, 684) | inrange(mdate, 695, 696) | inrange(mdate, 749, 750)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 683 | mdate == 695 | mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 684 | mdate == 696 | mdate == 750
			}
		}
		if "`company'" == "22222" {
			replace outlier_retain = 1 if inrange(mdate, 682, 684) | inrange(mdate, 695, 696) | inrange(mdate, 749, 750) | inrange(mdate, 755, 756)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 755 | mdate == 695 | mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 756 | mdate == 696 | mdate == 750
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+3])/2 if mdate == 682
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+2])/2 if mdate == 683
				replace tot_retain`type' = (tot_retain`type'[_n-3]+tot_retain`type'[_n+1])/2 if mdate == 684
			}
		}
		if "`company'" == "44444" {
			replace outlier_retain = 1 if inrange(mdate, 683, 684) | inrange(mdate, 695, 696) | inrange(mdate, 749, 750) | inrange(mdate, 755, 756)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 683 | mdate == 695 | mdate == 749 | mdate == 755
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 684 | mdate == 696 | mdate == 750 | mdate == 756
			}
		}
		if "`company'" == "55555" {
			replace outlier_retain = 1 if mdate == 644 | inrange(mdate, 683, 684) | inrange(mdate, 749, 750)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 683 | mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 684 | mdate == 750
				replace tot_retain`type' = . if mdate == 644	
			}
		}
		if "`company'" == "11111" {
			replace outlier_retain = 1 if inrange(mdate, 749, 750)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 750
			}
		}
		
		* Save
		gen cmp_company_code = `company'
		tempfile clean_monthly_`company'
		save `clean_monthly_`company''

}
	* Append
	clear
	foreach company in "11111" "55555" "44444" "22222" "33333"{
		append using `clean_monthly_`company''
	}	
	
	* Generate separation
	rename czone cz
	merge 1:1 cz mdate cmp_company_code using "$data/cb/clean_policy_firm_cz.dta", nogen keepusing(tot_emp tot_emp_7_29)
	egen id = group(cz cmp)
	xtset id mdate
	gen sep_rate = 1-(tot_retain/L.tot_emp)
	gen sep_rate_7_29 = 1-(tot_retain_7_29/L.tot_emp_7_29)
	
	* Save
	save "$data/cb/adj_retain.dta", replace
	
* ------------------------------------------------------------------------------
*6c. Create separation rate variable
* ------------------------------------------------------------------------------
foreach company in "11111" "55555" "44444" "22222" "33333" {
		
		* Load data
		if "`company'"=="22222"{
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part1.csv", clear 
			tempfile part1
			save `part1'
			
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part2.csv", clear 
			append using `part1'		
		}
		else {
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_`company'.csv", clear 
		}
		
		* ID wage bins
		gen wage_bin=subinstr(wage_flag,"wage_","",.)
		replace wage_bin="30" if wage_bin=="gt30"
		replace wage_bin="7" if wage_bin=="lt8"
		replace wage_bin="" if wage_bin=="null"
		destring wage_bin, replace
		drop if wage_bin==.
		drop wage_flag
		
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(archive), 1, 4) + "-" + substr(string(archive), 5, 2) + "-01"
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))
		la var year "Year"
		drop str_monthyear

		* Calculate total retention
		gen n_retain = freq_previousjoin
		bysort cz mdate: egen tot_retain = sum(n_retain)
	
		* Drop highest wage bin
		drop if wage_bin == 30

		* Generate total counts within cz-month --------------------------------
		
		* Calculate total retention with reported wage <30
		gen n_retain_7_29 = freq_previousjoin
		bysort cz mdate: egen tot_retain_7_29 = sum(n_retain_7_29)

		* Collapse to cz-month -------------------------------------------------
		
		collapse (mean) tot_retain*, by(mdate year month cz)
		
		* Adjust retention -----------------------------------------------------
		
		xtset cz mdate
		tsfill
		
		gen outlier_retain = 0
		gen tot_retain_unadj = tot_retain
		gen tot_retain_7_29_unadj = tot_retain_7_29
		if "`company'" == "33333" {
			replace outlier_retain = 1 if inrange(mdate, 683, 684) | inrange(mdate, 695, 696) | inrange(mdate, 749, 750)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 683 | mdate == 695 | mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 684 | mdate == 696 | mdate == 750
			}
		}
		if "`company'" == "22222" {
			replace outlier_retain = 1 if inrange(mdate, 682, 684) | inrange(mdate, 695, 696) | inrange(mdate, 749, 750) | inrange(mdate, 755, 756)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 755 | mdate == 695 | mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 756 | mdate == 696 | mdate == 750
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+3])/2 if mdate == 682
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+2])/2 if mdate == 683
				replace tot_retain`type' = (tot_retain`type'[_n-3]+tot_retain`type'[_n+1])/2 if mdate == 684
			}
		}
		if "`company'" == "44444" {
			replace outlier_retain = 1 if inrange(mdate, 683, 684) | inrange(mdate, 695, 696) | inrange(mdate, 749, 750) | inrange(mdate, 755, 756)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 683 | mdate == 695 | mdate == 749 | mdate == 755
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 684 | mdate == 696 | mdate == 750 | mdate == 756
			}
		}
		if "`company'" == "55555" {
			replace outlier_retain = 1 if mdate == 644 | inrange(mdate, 683, 684) | inrange(mdate, 749, 750)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 683 | mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 684 | mdate == 750
				replace tot_retain`type' = . if mdate == 644	
			}
		}
		if "`company'" == "11111" {
			replace outlier_retain = 1 if inrange(mdate, 749, 750)
			xtset cz mdate
			sort cz mdate
			foreach type in "" "_7_29" {
				replace tot_retain`type' = (tot_retain`type'[_n-1]+tot_retain`type'[_n+2])/2 if mdate == 749
				replace tot_retain`type' = (tot_retain`type'[_n-2]+tot_retain`type'[_n+1])/2 if mdate == 750
			}
		}
		
		* Save
		gen cmp_company_code = `company'
		tempfile clean_monthly_`company'
		save `clean_monthly_`company''

}
	* Append
	clear
	foreach company in "11111" "55555" "44444" "22222" "33333"{
		append using `clean_monthly_`company''
	}	
	
	* Generate separation
	rename czone cz
	merge 1:1 cz mdate cmp_company_code using "$data/cb/clean_policy_firm_cz.dta", nogen keepusing(tot_emp tot_emp_7_29)
	egen id = group(cz cmp)
	xtset id mdate
	gen sep_rate = 1-(tot_retain/L.tot_emp)
	gen sep_rate_7_29 = 1-(tot_retain_7_29/L.tot_emp_7_29)
	
	keep sep_rate_7_29 outlier_retain 
	
	* Save
	save "$data/cb/clean_policy_separation_rate_7_29_cz.dta", replace
	
* ------------------------------------------------------------------------------
*7. Previous industry of new hires dataset for policy employers
* ------------------------------------------------------------------------------
	* Get 2-digit NAICS codes and names from census.gov (URL: https://www.census.gov/programs-surveys/economic-census/year/2022/guidance/understanding-naics.html)
	insheet using "$data/cb/documentation/naics2d_names.csv", clear names
	tempfile naics2d_names
	save `naics2d_names'
	
	* Get 3-digit NAICS codes and names
	import excel using "$data/cb/raw/[Project Name].xlsx", sheet("Avg. Hourly Pay NAICS") cellrange(A4:B92) clear firstrow
	drop if er_naics==.
	rename er_naics _er_naics
	rename NAICSD er_naics_name
	tempfile industry_names
	save `industry_names'	
	
	* Read in raw data on new hires at policy employers from Credit Bureau
	insheet using "$data/cb/raw/[Project Name]_result1_3_state.csv", clear 

	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(wt_original_hire_date), 1, 4) + "-" + substr(string(wt_original_hire_date), 5, 2) + "-01"
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	gen month=month(dofm(mdate))
	gen year=yofd(dofm(mdate))
	la var year "Year"
	
	* Flag missing industry
	replace _er=999 if _er==.
	
	* Gen 2-digit NAICS variable
	gen naics2d = substr(string(_er),1,2)
	replace naics2d="31-33" if naics2d=="31" | naics2d=="32" | naics2d=="33" 
	replace naics2d="44-45" if naics2d=="44" | naics2d=="45" 
	replace naics2d="48-49" if naics2d=="48" | naics2d=="49"
	
	merge m:1 naics2d using `naics2d_names'
	replace naics2d_name="Missing" if naics2d_name==""
	
	drop _merge source

	* Add 3-digit NAICS names
	merge m:1 _er_naics using `industry_names'

	* Drop obs whose previous company is current company (1.77% of observations)
	drop if curreqlpre==1

	* Generate total new hires variable 
	egen new_hires=rowtotal(wage_lt30 wage_30)
	la var new_hires "New hires"	
	la var new_hires "Total new hires by 2-digit NAICS"
	
	* Total new hires by year and industry of origin
	collapse (sum) new_hires, by(year _er_naics er_naics_name cmp_company_code)
		
	* Total new hires across all industries
	egen tot_new_hires=sum(new_hires),by(cmp year)
	la var tot_new_hires "Total new hires"
	gen share_new_hires=new_hires/tot_new_hires	
	la var share_new_hires "Share of new hires by 2-digit NAICS"
	egen rank = rank(-new) , by(year cmp)
	la var rank "Rank of new hire previous industry share"	
	
	la var _er_naics "3-digit NAICS code"
	la var er_naics_name "2-digit NAICS name"	
	
	* Save previous industry of new hires dataset for policy employers
	save "$data/cb/clean_policy_firm_new_hires.dta", replace

* ------------------------------------------------------------------------------
*8.  Clean dataset on previous company of policy company new hires
* ------------------------------------------------------------------------------
	* Read in separations file for nonpolicy companies
	insheet using "$data/cb/raw/[Project Name]_result1_7_czone_nonpol_pol.csv", clear
	
	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(wt_), 1, 4) + "-" + substr(string(wt_), 5, 2) + "-01"
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	gen month=month(dofm(mdate))
	gen year=yofd(dofm(mdate))
	
	collapse (sum) freq,by(year cmp*previous cmp*new )	
	rename cmp_company_code_previous cmp_company_code	
	drop if cmp_company_code_new=="other"
	destring cmp_company_code_new, replace
	rename cmp_company_code _cmp_company_code
	rename cmp_company_code_new cmp_company_code
	rename freq separations
	
	tempfile separations
	save `separations'

	* Get 2-digit NAICS codes and names from census.gov
	* URL: https://www.census.gov/programs-surveys/economic-census/year/2022/guidance/understanding-naics.html
	insheet using "$data/cb/documentation/naics2d_names.csv", clear names
	drop source
	tempfile naics2d_names
	save `naics2d_names'
	
	* Get 3-digit NAICS codes and names
	import excel using "$data/cb/raw/[Project Name].xlsx", sheet("Avg. Hourly Pay NAICS") cellrange(A4:B92) clear firstrow
	drop if er_naics==.
	rename er_naics _er_naics
	rename NAICSD er_naics_name
	tempfile industry_names
	save `industry_names'	
	
	* Read in raw data on new hires at policy employers from Credit Bureau
	insheet using "$data/cb/raw/[Project Name]_result1_3_precomp.csv", clear 
	
	* Clean variable names and labels
	la var _er_naics "Non-policy company 3-digit NAICS code"
	la var cmp_company_code "Policy company code"
	la var _cmp_company_code "Non-policy company code"
	rename wt_original_hire_date_yyyy year
	la var year "Year"
	
	* Flag missing industry
	replace _er_naics=999 if _er_naics==.
	
	* Gen 2-digit NAICS variable
	gen naics2d = substr(string(_er_naics),1,2)
	replace naics2d="31-33" if naics2d=="31" | naics2d=="32" | naics2d=="33" 
	replace naics2d="44-45" if naics2d=="44" | naics2d=="45" 
	replace naics2d="48-49" if naics2d=="48" | naics2d=="49"
	la var naics2d "Non-policy company 2-digit NAICS code"
	
	* Merge 2-digit NAICS names
	merge m:1 naics2d using `naics2d_names', nogen
	replace naics2d_name="Missing" if naics2d_name==""
	la var naics2d_name "Non-policy company 2-digit NAICS name"	

	* Merge 3-digit NAICS names
	merge m:1 _er_naics using `industry_names'
	la var er_naics_name "Non-policy company 3-digit NAICS name"
	drop _merge

	* Generate total new hires variable 
	rename freq new_hires
	la var new_hires "Total new hires at policy company from nonpolicy company"
		
	* Total new hires across all industries
	egen tot_new_hires=sum(new_hires), by(cmp year)
	la var tot_new_hires "Total new hires at policy company in year"
	gen share_new_hires=new_hires/tot_new_hires	
	la var share_new_hires "New hires coming from nonpolicy company / Total new hires at policy company"
	egen rank = rank(-new) , by(year cmp)
	la var rank "Rank of new hire previous company share, within year and policy company"	
	
	* Save previous industry of new hires dataset for policy employers
	merge m:1 cmp _cmp year using `separations', nogen
	save "$data/cb/clean_policy_firm_new_hires_previous_company.dta", replace

* ------------------------------------------------------------------------------
*9.  Get exact average hourly wage for non-policy companies
* ------------------------------------------------------------------------------

* Read in cmp_company_code, czone, archive, wage_flag, total_gross_pay_rate, and freq_total_gross_pay_rate. 
* Create avg_wage_exact by taking the weighted average of average pay rate across bins. 
* Save file at cmp_company_code, cz, mdate level with avg_wage_exact. 

	clear
	
* Load data --------------------------------------------------------------------
	
	* 2013
	foreach monthset in "201301_201303" "201304_201305" "201306_201307" "201308_201309" "201310_201311" "201312_201401" {
		
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_`monthset'.csv", clear
		keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
		tempfile mn`monthset'
		save `mn`monthset''
		
	} 
	clear
	foreach monthset in "201301_201303" "201304_201305" "201306_201307" "201308_201309" "201310_201311" "201312_201401" {
			
		append using `mn`monthset''
		erase `mn`monthset''
			
	}
	save "$data/cb/temp/raw_nonpol_2013", replace
	
	* 2014
	foreach monthset in "201402_201403" "201404_201405" "201406_201407" "201408_201409" "201410_201411" "201412_201501" {
		
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_`monthset'.csv", clear
		keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
		tempfile mn`monthset'
		save `mn`monthset''
		
	} 
	clear
	foreach monthset in "201402_201403" "201404_201405" "201406_201407" "201408_201409" "201410_201411" "201412_201501" {
			
		append using `mn`monthset''
		erase `mn`monthset''
			
	}
	save "$data/cb/temp/raw_nonpol_2014", replace
	
	
	* 2015
	foreach monthset in "201502_201503" "201504_201505" "201506_201507" "201508_201509" "201510_201511" "201512_201601" {
		
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_`monthset'.csv", clear
		keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
		tempfile mn`monthset'
		save `mn`monthset''
		
	} 
	clear
	foreach monthset in "201502_201503" "201504_201505" "201506_201507" "201508_201509" "201510_201511" "201512_201601" {
			
		append using `mn`monthset''
		erase `mn`monthset''
			
	}
	save "$data/cb/temp/raw_nonpol_2015", replace
	
	* 2016
	foreach monthset in "201602_201603" "201604_201605" "201606_201607" "201608_201608" "201609_201609" "201610_201610" "201611_201611" "201612_201701" {
		
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_`monthset'.csv", clear
		keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
		tempfile mn`monthset'
		save `mn`monthset''
		
	} 
	clear
	foreach monthset in "201602_201603" "201604_201605" "201606_201607" "201608_201608" "201609_201609" "201610_201610" "201611_201611" "201612_201701"  {
			
		append using `mn`monthset''
		erase `mn`monthset''
			
	}
	save "$data/cb/temp/raw_nonpol_2016", replace
	
	* 2017
	foreach monthset in "201702_201703" "201704_201704" "201705_201705" "201706_201706" "201707_201707" "201708_201708" "201709_201709" "201710_201710" "201711_201711" "201712_201801" {
		
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_`monthset'.csv", clear
		keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
		tempfile mn`monthset'
		save `mn`monthset''
		
	} 
	clear
	foreach monthset in "201702_201703" "201704_201704" "201705_201705" "201706_201706" "201707_201707" "201708_201708" "201709_201709" "201710_201710" "201711_201711" "201712_201801"  {
			
		append using `mn`monthset''
		erase `mn`monthset''
			
	}
	save "$data/cb/temp/raw_nonpol_2017", replace
	
	* 2018
	foreach monthset in "201802_201802" "201803_201803" "201804_201804" "201805_201805" "201806_201806" "201807_201807" "201808_201808" "201809_201809" "201810_201810" "201811_201811" "201812_201812" {
		
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_`monthset'.csv", clear
		keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
		tempfile mn`monthset'
		save `mn`monthset''
		
	} 
	clear
	foreach monthset in "201802_201802" "201803_201803" "201804_201804" "201805_201805" "201806_201806" "201807_201807" "201808_201808" "201809_201809" "201810_201810" "201811_201811" "201812_201812"  {
			
		append using `mn`monthset''
		erase `mn`monthset''
			
	}
	save "$data/cb/temp/raw_nonpol_2018", replace
	
	* 2019 - 2022
	foreach year of numlist 2019/2022 {
		
		foreach month of numlist 1/12 {
			
			local mn "`month'" 
			if `month' < 10 {
				local mn "0`month'"
			}
		
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_`year'`mn'_`year'`mn'.csv", clear
			keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
			tempfile mn`year'`month'
			save `mn`year'`month''
			
			di "`mn' `year' done"
			
		}

		clear
		foreach month of numlist 1/12 {
			
			append using `mn`year'`month''
			erase `mn`year'`month''
			
		}
		
	save "$data/cb/temp/raw_nonpol_`year'", replace
	
	}
	
	* 2023
	foreach month of numlist 1/8 {
			
		local mn "`month'" 
		if `month' < 10 {
			local mn "0`month'"
		}
		
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_DETAIL/[Project Name]_result1_2_czone_nonpol_2023`mn'_2023`mn'.csv", clear
		keep cmp_company_code czone archive wage_flag total_gross_pay_rate freq_gross_pay_rate
			
		tempfile mn2023`month'
		save `mn2023`month''
			
		di "`mn' 2023 done"
			
	}

	clear
	foreach month of numlist 1/8 {
			
		append using `mn2023`month''
		erase `mn2023`month''
			
	}
	save "$data/cb/temp/raw_nonpol_2023", replace

* Get average wage -------------------------------------------------------------
	
	* Store tempfile of archive to mdate crosswalk
	insheet using "$data/cb/documentation/archive_mdate_crosswalk.csv", clear
	tempfile archive_mdate_crosswalk
	save `archive_mdate_crosswalk'
	
	* Clean by year
	foreach year of numlist 2013/2023 {
		
		* Load data
		use "$data/cb/temp/raw_nonpol_`year'"
		
		* Get mdate
		merge m:1 archive using `archive_mdate_crosswalk', keep(1 3) nogen
		drop archive
		
		* Drop highest wage bin
		drop if wage_flag == "wage_gt30"
		
		* Collapse
		gen avg_wage_exact = total_gross_pay_rate/freq_gross_pay_rate
		gen avg_wage_exact_gt8 = avg_wage_exact if wage_flag != "wage_lt8"
		collapse (mean) avg_wage_exact* [aw = freq_gross_pay_rate], by(cmp_company_code cz mdate)
		
		* Save
		tempfile yr`year'
		save `yr`year'', replace
		
	}
	
	* Append
	clear
	foreach year of numlist 2013/2023 {
	
		append using `yr`year''
		erase `yr`year''
	
	}
	
	* Save
	rename czone cz
	save "$data/cb/nonpol_avg_wage_exact", replace

* 
* ------------------------------------------------------------------------------
*10. Get average wage rate of new hires for non-policy_firm
* ------------------------------------------------------------------------------	
* Generate average wage among new hires for non-policy companies

	foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
		insheet using "$data/cb/raw/[Project Name]_result1_3_czone_allcompany_`dateset'.csv", clear
			drop if inlist(cmp_company_code,22222, 44444, 55555, 66666, 33333, 11111 )
		tempfile newhires_`dateset'
		save `newhires_`dateset'', replace
	}
	clear
	foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
		append using `newhires_`dateset''
	}
			
	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(wt_original), 1, 4) + "-" + substr(string(wt_original), 5, 2)
	drop if strlen(string(wt_original)) == 5 // duplicates
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	gen month=month(dofm(mdate))	
	gen year=yofd(dofm(mdate))	
	
	* Rename CZ variable
	rename czone cz
	
	* Generate share at wage bin variables
	rename wage_lt8 wage_7
	rename wage_gt30 wage_30
	
	* Gen tot emp and tot emp under $30
	egen tot_emp = rowtotal(wage_*)
	
	egen tot_emp_7_29 = rowtotal(wage_7 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29)
	
	* Generate average wage among workers earning < $30
	foreach num of numlist 7/29 {
		gen avgwage_`num' = `num'*wage_`num'
	}
	egen avg_wage_bins_nh = rowtotal(avgwage_*)
	replace avg_wage_bins_nh = avg_wage_bins_nh/tot_emp_7_29
	gen ln_avg_wage_bins_nh= log(avg_wage_bins_nh)
	
	keep avg_wage_bins_nh ln_avg_wage_bins_nh cmp_company_code mdate cz
	
	save "$data/cb/clean_avg_wage_new_hires_nonpolicy_firm_cz.dta", replace
	
* ------------------------------------------------------------------------------
*11. Minimally cleaned CZ-level dataset on non-policy employers for summary stats
* ------------------------------------------------------------------------------	
	* Get 3-digit NAICS codes and names
	import excel using "$data/cb/raw/[Project Name].xlsx", sheet("Avg. Hourly Pay NAICS") cellrange(A4:B92) clear firstrow
	drop if er_naics==.
	rename NAICSD er_naics_name
	tempfile industry_names
	save `industry_names'	
	
	* Get NAICS codes for companies
	insheet using  "$data/cb/raw/[Project Name]_result1_8_ernaics.csv", clear
	contract cmp_company_code er_naics
	quietly bysort cmp_company_code:  gen dup = cond(_N==1,0,_n)
	tab dup
	drop if dup>1
	tempfile cmp_naics
	save `cmp_naics'
	
	foreach yearset in "2013_2014" "2015" "2016" "2017" "2018" "2019" "2020" "2021" "2022" "2023" {
	
		insheet using "$data/cb/raw/[Project Name]_result1_2_czone_nonpol_`yearset'.csv", clear
		
		* Merge in data on company industry
		merge m:1 cmp_company_code using `cmp_naics' , keepusing(er_naics) 
		drop if _merge==2
		drop _merge
		
		* Identify main feeder industries to policy firms
		gen feeder_ind= inlist(er_naics,561,452,722,445,448,492,454,444,451)
		gen poaching_ind = inlist(er_naics,561,722,445,452,448,492,444,622,541,451,441,611,453,311)
		gen sample_ind=(feeder_ind ==1 | poaching_ind==1)
			
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(archive), 1, 4) + "-" + substr(string(archive), 5, 2) + "-01"
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))	
		
		* Remove NAICS greater than 3-digits (not clear what these are)
		*drop if length(string(er_naics))>3
			// DO WE WANT THIS NOW?
		
		* Rename CZ variable
		rename czone cz
		
		* Generate total employment variable
		egen tot_emp=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29 wage_gt30)		
		la var tot_emp "Non-policy company total employment"
		
		tempfile nonpol_`yearset'
		save `nonpol_`yearset''		
	}
	
	clear
	
	foreach yearset in "2013_2014" "2015" "2016" "2017" "2018" "2019" "2020" "2021" "2022" "2023" {
		append using `nonpol_`yearset''
	}	
	
	* Merge with avg wage data
	merge 1:1 cz cmp_company_code mdate using "$data/cb/nonpol_avg_wage_exact", nogen // MEA EDIT

	* Merge with new hire wage
	merge 1:1 cz cmp_company_code mdate using "$data/cb/clean_avg_wage_new_hires_nonpolicy_firm_cz.dta", nogen keepusing(ln_avg_wage_bins_nh avg_wage_bins_nh)

	* Drop null wages
	drop wage_null
	
	* Identify large firms
	egen max_emp=max(tot_emp),by(cmp_company_code)
	label variable max "Non-policy company max employment in CZ month"
	
	* Variable labels
	label variable archive "Month of record"
	label variable cz "Commuting zone"
	label variable wage_lt8 "Number employees making less than $8/hr"
	foreach num of numlist 8/29 {
		local plus = `num' + 1
		label variable wage_`num' "# with wage at least $`num' and below $`plus'"
	}
	label variable wage_gt30 "Number employees making at least $30/hr"
	label variable freq "Total recorded employees with hourly wage"
	label variable sample_ind "Industry among top 9 industries with flows to policy companies"
	label variable str_monthyear "Month string"
	label variable mdate "Month"
	label variable month "Month (1-12)"
	label variable year "Year"

	save "$data/cb/raw_non_policy_firm_dataset.dta", replace
	
* ------------------------------------------------------------------------------
*12. CZ-level monthly wage distributions dataset for non-policy employers
* ------------------------------------------------------------------------------	
		* Load
		use "$data/cb/raw_non_policy_firm_dataset.dta", clear
	
		* Get industries
		preserve
			import excel using "$data/cb/raw/[Project Name].xlsx", sheet("Avg. Hourly Pay NAICS") cellrange(A4:B92) clear firstrow
			drop if er_naics==.
			rename NAICSD er_naics_name
			tempfile industry_names
			save `industry_names'
		restore
		preserve
			keep cmp_company_code er_naics
			duplicates drop
			merge m:1 er_naics using `industry_names', nogen
			bysort cmp_company_code: gen temp = _N
			replace er_naics_name = "Multiple industries" if temp > 1
			drop temp er_naics
			duplicates drop
			tempfile cmp_industry
			save `cmp_industry'
		restore
		merge m:1 cmp_company_code using `cmp_industry', keep(1 3) nogen
		replace er_naics_name = "Not listed" if er_naics_name == ""
	
		* Collapse
		rename freq emp
		rename max_emp max_emp_cz
		collapse (sum) emp (mean) max_emp, by(mdate cmp_company_code er_naics_name)
		
		* Set as panel
		xtset cmp_company_code mdate
	
	* Flags
		
		* Low reporting
		sort cmp_comp mdate
		gen low_report = emp < .5*L.emp & emp < .5*F.emp & !missing(L.emp) & !missing(F.emp)
		bysort cmp_company_code: egen months_low = total(low_report)
		* Create adjustment
		tssmooth ma emp_ma = emp, window(1 0 1)
		gen emp_adj = emp
		replace emp_adj = emp_ma if low_report == 1
		
		* Big jump (potentially persistent)
		sort cmp_comp mdate
		gen d_emp_adj = emp_adj[_n-1] - emp_adj if cmp_company_code[_n-1] == cmp_company_code
		by cmp_company_code: egen sd_d_emp_adj = sd(d_emp_adj)
		by cmp_company_code: egen mean_d_emp_adj = mean(d_emp_adj)
		gen z_d_emp_adj = (d_emp_adj - mean_d_emp_adj)/sd_d_emp_adj
		sort cmp_comp mdate
		by cmp_company_code: gen d_emp_large = abs(z_d_emp_adj) > 2*sqrt(_N/12) & !missing(z_d_emp_adj)
		bysort cmp_company_code: egen max_d_emp = max(abs(z_d_emp_adj))
		bysort cmp_company_code: egen months_d_emp_large = sum(d_emp_large)

		* Small firm
		bysort cmp_company_code: egen max_emp_nat = max(emp)
		preserve
			collapse (mean) max_emp_nat, by(cmp_company_code)
			sum max_emp_nat, d
		restore
		gen small_firm = max_emp_nat < `r(p10)'
		
		* Num. months with positive reporting
		gen pos_reporting = emp > 0 & !missing(emp)
		bysort cmp_company_code: egen obs_nat = total(pos_reporting)
		
		* Define spell with consecutive months with positive reporting and no big jump
		gen spell_start = L.mdate != mdate - 1 | L.emp == 0 | L.emp == . | emp == 0 | emp == . | d_emp_large == 1 | (abs(L.emp - emp)<0.1 & abs(F.emp - emp)<0.1) | (abs(L2.emp - emp)<0.1 & abs(L.emp - emp)<0.1) | (abs(F2.emp - emp)<0.1 & abs(F.emp - emp)<0.1)
		sort cmp_company_code mdate
		bysort cmp_company_code: gen spell_id = sum(spell_start)
		bysort spell_id cmp_company_code: gen spell_length = _N
		bysort cmp_company_code: egen max_spell = max(spell_length)

	* Define sample
	
		cap drop insample*
		gen insample = 1
		
		* Num. months with low reporting
		replace insample = 0 if months_low > 6

		* Max number of big jumps 
		replace insample = 0 if months_d_emp_large > 4
		
		* Firm size
		preserve
			collapse (mean) max_emp_nat, by(cmp_company_code)
			sum max_emp_nat, d
		restore
		replace insample = 0 if max_emp_nat < `r(p10)'
		
		* Consistent (continuous) reporting for 12 months
		replace insample = 0 if spell_length < 12
		
		* Calculate percent in sample
		bysort cmp_company_code: egen insample_pct = mean(insample)
			
	* Manually exclude firms from sample based on graphs
	
	* Clean up
	
		drop emp_ma d_emp_adj sd_d_emp_adj mean_d_emp_adj pos_reporting
		
		label variable emp "Total national employment at company"
		label variable emp_adj "Total national employment at company - adjusted"
		
		label variable max_emp_cz "Max. CZ-month employment for company"
		label variable max_emp_nat "Max. national monthly employment for company"
		label variable small_firm "Max. national monthly employment less than 10th percentile of companies"
		
		label variable low_report "Low reporting month = variables adjusted"
		label variable months_low "# months company has abnormally low reporting"
		
		label variable z_d_emp_adj "Z-score of change in adj. national employment by company"
		label variable max_d_emp "Max. z-score of change in adj. national employment by company"
		label variable d_emp_large "Unusually large change in adj. national employment"
		label variable months_d_emp_large "Months with unusually large change in adj. national employment"
		
		label variable obs_nat "Number of months with any reported employment at company"
		
		label variable spell_id "ID of spell of reporting - use with cmp_company_code"
		label variable spell_start "Start of new spell of reporting"
		label variable spell_length "Length of spell of consistent reporting"
		label variable max_spell "Max. length of spell of consistent reporting by company"
		
		label variable insample "Observation in sample of reasonable spells"
		label variable insample_pct "Percent of observed months at company that are in sample"
			
	* Save
	
		save "$data/cb/insample_non_policy_firm_dataset.dta", replace

* Merge with CZ-month-company panel and adjust ---------------------------------

* ------------------------------------------------------------------------------
* From 1c lines 1306-1433
* CZ-level monthly wage distributions dataset for non-policy employers
* ------------------------------------------------------------------------------	
	use "$data/cb/raw_non_policy_firm_dataset.dta", clear

	* Identify outliers to be adjusted and sample indicator
	merge m:1 mdate cmp_company_code using "$data/cb/insample_non_policy_firm_dataset.dta", keep(1 3) nogen
	drop emp emp_adj

	* Set as panel data
	egen id = group(cmp_company_code cz)
	xtset id mdate

	* Adjust in low reporting months
	foreach var of varlist wage* tot_emp {
		gen adj_`var' = `var'
		replace adj_`var' = (L.`var' + F.`var')/2 if low_report==1
	}
	
	* Keep adjusted variables only, except employment
	rename tot_emp tot_emp_unadj
	drop wage*	
	rename adj_* *

	la var tot_emp_unadj "Non-policy firm total employment, unadjusted"
	la var tot_emp "Non-policy firm total employment, adjusted for low reporting"
	
	* Generate share at wage bin variables
	rename wage_lt8 wage_7
	rename wage_gt30 wage_30
	
	forval i=8(1)29{
		gen share_wage_`i'=wage_`i'/(tot_emp)
		la var share_wage_`i' "Share at $`i'"
	}			
	gen share_wage_lt8=wage_7/tot_emp
	gen share_wage_gt8=1-share_wage_lt8
	la var share_wage_lt8 "Share wage < $8"
	la var share_wage_gt8 "Share wage > $8"
	local wage_var = "wage_7"
	forval w=8(1)29{
		local str_w=" + wage_`w'"
		local wage_var = "`wage_var'" + "`str_w'"
		di "`wage_var'"
		local j=`w'+1
		gen share_wage_lt`j'=(`wage_var')/tot_emp
		la var share_wage_lt`j' "Share wage < $`j'"
		gen share_wage_gt`j'=1-share_wage_lt`j'
		la var share_wage_gt`j' "Share wage >= $`j'"
	}

	* Generate average wage among workers earning < $30
	foreach num of numlist 7/29 {
		local lognum=log(`num')
		gen avgwage_`num' = `num'*wage_`num'
		gen avglnwage_`num' = `lognum'*wage_`num'
	}
	gen tot_emp_7_29 = tot_emp - wage_30
	egen avg_wage_bins = rowtotal(avgwage_*)
	replace avg_wage_bins = avg_wage_bins/tot_emp_7_29
	egen avg_ln_wage_bins = rowtotal(avglnwage_*)
	replace avg_ln_wage_bins = avg_ln_wage_bins/tot_emp_7_29
	gen tot_emp_8_29 = tot_emp_7_29 - wage_7
	egen avg_wage_bins_gt8 = rowtotal(avgwage_*)
	replace avg_wage_bins_gt8 = (avg_wage_bins_gt8-avgwage_7)/tot_emp_8_29
	egen avg_ln_wage_bins_gt8 = rowtotal(avglnwage_*)
	replace avg_ln_wage_bins_gt8 = (avg_ln_wage_bins_gt8-avglnwage_7)/tot_emp_8_29
	drop avgwage* avglnwage*
	
	gen ln_avg_wage_exact_gt8=log(avg_wage_exact_gt8)
	gen ln_avg_wage_exact=log(ln_avg_wage_exact)
	gen ln_avg_wage_bins=log(avg_wage_bins)
	
	* Merge in CZ characteristics
	merge m:1 cz using "$data/opportunity_insights/raw/onlinedata8.dta", keepusing(pop2000 czname stateabbrv) keep (3) nogen // 6 CZs from the OI data are not present in the firm data
	gen czname_long=czname+ ", " + stateabbrv

	* Merge in state MW information
	preserve
		use "$data/minimum_wages/raw/mw_state_stata/mw_state_monthly.dta", clear
		gen year=yofd(dofm(monthly_date)) // Create year variable
		sum monthly_date
		keep if monthly_date==r(max)
		keep if mean_mw==7.25
		keep stateabb
		rename stateabb stateabbrv

		tempfile states_no_mw
		save `states_no_mw'
	restore
	
	merge m:1 stateabbrv using `states_no_mw'
	gen no_state_mw=(_merge==3)
	drop _merge
	
	* Merge in all new hires
	preserve
		* Load new hires data
		foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
			insheet using "$data/cb/raw/[Project Name]_result1_3_czone_allcompany_`dateset'.csv", clear
				drop if inlist(cmp_company_code,22222, 44444, 55555, 66666, 33333, 11111 )
			tempfile newhires_`dateset'
			save `newhires_`dateset'', replace
		}
		clear
		foreach dateset in "201301_201712" "201801_202112" "202201_202308" {
			append using `newhires_`dateset''
		}
		
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(wt_original), 1, 4) + "-" + substr(string(wt_original), 5, 2)
		drop if strlen(string(wt_original)) == 5 // duplicates
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))	
		
		* Rename CZ variable
		rename czone cz
			
		* Generate total new hires variable
		egen tot_new_hires=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29 wage_gt30)		
		la var tot_new_hires "Total new hires"
		egen tot_new_hires_7_29=rowtotal(wage_lt8 wage_8 wage_9 wage_10 wage_11 wage_12 wage_13 wage_14 wage_15 wage_16 wage_17 wage_18 wage_19 wage_20 wage_21 wage_22 wage_23 wage_24 wage_25 wage_26 wage_27 wage_28 wage_29)		
		la var tot_new_hires_7_29 "Total new hires with wage < $30"
		drop wage* 
		
		* Clean
		keep cmp_company_code cz mdate month year tot_new_hires* 

		* Save
		tempfile new_hires_full
		save `new_hires_full', replace	
	restore 
	
	merge 1:1 cmp_company_code mdate cz using `new_hires_full'
	drop if _merge==2
	replace tot_new_hires=0 if _merge==1
	drop _merge
	
	* Label
	la var avg_wage_bins "Non-policy firm average wage < $30"
	la var avg_ln_wage_bins "Non-policy firm average log wage < $30"
	la var ln_avg_wage_exact_gt8 "Log non-policy firm exact average wage $8-$29"
	la var ln_avg_wage_exact "Log non-policy firm exact average wage < $30"
	la var ln_avg_wage_bins "Log non-policy firm average wage bin < $30"
	
	la var tot_new_hires "New hires"
	la var tot_new_hires_7_29 "New hires <$30"

	* Generate log employment variable
	gen ln_tot_emp_7_29=log(tot_emp_7_29)
	la var ln_tot_emp_7_29 "Non-policy firm log total employment making less than $30 an hour"

	* Save
	save "$data/cb/clean_nonpolicy_firm_cz.dta", replace	
	
* ------------------------------------------------------------------------------
*13. Dataset on non-policy firm flows to policy firms
* ------------------------------------------------------------------------------
	* Store tempfile of archive to mdate crosswalk
	insheet using "$data/cb/documentation/archive_mdate_crosswalk.csv", clear
	tempfile archive_mdate_crosswalk
	save `archive_mdate_crosswalk'	

	* Load event info
	use "$data/cb/events.dta", clear
	
	* Loop through experiments
	levelsof eventid, local(exp)
	foreach experiment in `exp' {
		
		* Keep relevant row
		use in `experiment' using "$data/cb/events.dta", clear
		
		* Identify date
		merge 1:1 archive using `archive_mdate_crosswalk'
		
		* Store relevant parameters
		foreach vr in cmp_company_code mdate {
			local `vr'=`vr'[1]
		}
		
		local flows_start_date=`mdate'-24
		local flows_end_date=`mdate'-1
	
		* Load data on new company of policy separations at the month X CZ level
		insheet using "$data/cb/raw/[Project Name]_result1_7_czone_nonpol_pol_hire_date.csv", clear
		
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(wt_), 1, 4) + "-" + substr(string(wt_), 5, 2) + "-01"
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))
				
		* Keep only the new hires from the policy company
		keep if cmp_company_code_new=="`cmp_company_code'"
		
		* Keep all months prior to policy event
		keep if inrange(mdate,`flows_start_date', `flows_end_date')
		
		* Collapse to non-policy firm level, drop companies with multiple industries
		collapse (mean) mean_sep_to_policy = freq, by(cmp_company_code_previous)
		
		* Rename nonpolicy company code
		rename cmp_company_code_previous cmp_company_code
		
		* Save total inflows from policy company by new company
		tempfile flow_`experiment'
		save `flow_`experiment''
	
		* Read in clean nonpolicy firm information for total new hires
		use "$data/cb/clean_nonpolicy_firm_cz.dta", clear
		
		* Restrict to the months before the event
		keep if inrange(mdate,`flows_start_date', `flows_end_date')
		
		* Collapse to company level to get the total number of new hires by that company in the months before the policy event
		collapse (mean) tot_emp_pre=tot_emp, by(cmp_company_code)	
		
		* Merge in flows for experiment
		merge 1:1 cmp using `flow_`experiment''
		replace mean_sep_to_policy=0 if _merge==1
		gen sep_to_policy_rate=mean_sep_to_policy/tot_emp_pre
			
		* Save experiment code
		gen trt_exp = `experiment'
		
		* Clean
		keep cmp_company_code sep_to_policy_rate mean_sep_to_policy tot_emp_pre trt_exp
		label variable cmp_company_code "Non-policy company code"

		label variable trt_exp "Experiment code"
		
		* Save
		save "$data/cb/nonpolicy_separations_to_policy_`experiment'.dta", replace
		
	}
	
* ------------------------------------------------------------------------------
*14. Dataset on policy firm flows to non-policy firms
* ------------------------------------------------------------------------------
	* Store tempfile of archive to mdate crosswalk
	insheet using "$data/cb/documentation/archive_mdate_crosswalk.csv", clear
	tempfile archive_mdate_crosswalk
	save `archive_mdate_crosswalk'	

	* Load event info
	use "$data/cb/events.dta", clear
	
	* Loop through experiments
	levelsof eventid, local(exp)
	foreach experiment in `exp' {
		
		* Keep relevant row
		use in `experiment' using "$data/cb/events.dta", clear
		
		* Identify date
		merge 1:1 archive using `archive_mdate_crosswalk'
		
		* Store relevant parameters
		foreach vr in cmp_company_code mdate {
			local `vr'=`vr'[1]
		}

		local flows_start_date=`mdate'-24
		local flows_end_date=`mdate'-1	
	
		* Load clean flows data
		use "$data/cb/raw_policy_to_nonpolicy_new_hires.dta", clear
		
		* Select policy company
		keep if cmp_company_code_previous=="`cmp_company_code'"
		
		* Select observations in calendar month-year prior to policy
		keep if inrange(mdate,`flows_start_date', `flows_end_date')
		
		* Collapse to non-policy firm level, drop companies with multiple industries
		collapse (mean) new_hires_from_policy = freq , by(cmp_company_code )
		
		* Save max inflow of new hires
		tempfile flow_`experiment'
		save `flow_`experiment''
	
		* Read in clean nonpolicy firm information for total new hires
		use "$data/cb/clean_nonpolicy_firm_cz.dta", clear
		
		* Restrict to the months before the event
		keep if inrange(mdate,`flows_start_date', `flows_end_date')
		
		* Collapse to company level to get the total number of new hires by that company in the months before the policy event
		collapse (mean) tot_emp_pre=tot_emp (sum) tot_new_hires_pre = tot_new_hires, by(cmp_company_code)	
		
		* Merge in flows for experiment
		merge 1:1 cmp using `flow_`experiment''
		replace new_hires_from_policy=0 if _merge==1
		drop _merge
		gen share_new_hires_policy=new_hires_from_policy/tot_new_hires_pre
		gen new_hires_policy_rate=new_hires_from_policy/tot_emp_pre
		
		* Save experiment code
		gen trt_exp = `experiment'
		
		* Clean
		keep cmp_company_code new_hires_from_policy new_hires_policy_rate share_new_hires_policy tot_new_hires_pre tot_emp_pre trt_exp
		label variable cmp_company_code "Non-policy company code"
		label variable trt_exp "Experiment code"
		
		* Save
		save "$data/cb/nonpolicy_new_hires_from_policy_`experiment'.dta", replace
		
	}
		
* ------------------------------------------------------------------------------
*15. Average non-policy CZ wages and employment 
* ------------------------------------------------------------------------------
	use "$data/cb/raw_non_policy_firm_dataset.dta", clear
	
	* Restrict to main feeder industries to policy firms
	keep if sample_ind == 1

	* Interpolate employment for non-reporting months identified as outliers in employment series.
	preserve
		collapse (sum) tot_emp, by(mdate cmp_company_code)
		xtset cmp_company_code mdate
		tssmooth ma emp_ma = tot_emp, window(1 1 1)
		gen outlier = tot_emp < emp_ma*.75
		keep mdate cmp_company_code outlier
		tempfile outliers
		save `outliers', replace
	restore

	merge m:1 mdate cmp_company_code using `outliers', nogen

	* Set as panel data
	egen id = group(cmp_company_code cz)
	xtset id mdate

	foreach var of varlist wage* tot_emp {
		gen adj_`var' = `var'
		replace adj_`var' = (L.`var' + F.`var')/2 if outlier==1
	}
	
	* Keep adjusted variables only, except employment
	rename tot_emp tot_emp_unadj
	drop outlier
	drop wage*	
	rename adj_* *

	la var tot_emp_unadj "Non-policy firm total employment, unadjusted"
	la var tot_emp "Non-policy firm total employment, adjusted for low reporting"
	
	* Generate share at wage bin variables
	rename wage_lt8 wage_7
	rename wage_gt30 wage_30
	
	* Generate average wage among workers earning < $30	
	drop wage_30
	keep wage_* mdate cmp_company_code cz 
	
	* Reshape long
	reshape long wage_, i(mdate cmp_company_code cz) j(dollar)
	rename wage_ n_workers
	
	* Generate weighted average
	egen tot_emp_7_30=sum(n_workers),by( mdate cz)
	gen wt_workers=n_workers/tot_emp_7_30
	gen weighted_wage=wt_workers*dollar
	collapse (sum) avg_wage_bins=weighted_wage (mean) tot_emp_7_30, by( mdate cz)
	
	* Clean
	gen ln_avg_wage_bins=log(avg_wage_bins)
	keep avg_wage_bins ln_avg_wage_bins mdate cz tot_emp_7_30
	
	* Save
	tempfile avg_wage
	save `avg_wage'
	
	* Label
	la var avg_wage_bins "Non-policy average wage < $30"
	la var ln_avg_wage_bins "Non-policy log average wage < $30"
	
	* Generate log employment variable
	gen ln_tot_emp_7_29=log(tot_emp_7_30)
	la var ln_tot_emp_7_29 "Non-policy firm log total employment making less than $30 an hour"

	* Save
	save "$data/cb/clean_nonpolicy_cz.dta", replace	

* ------------------------------------------------------------------------------
*16. Clean poaching/feeder raw file for analysis of predictive power of poach/feed flag
* ------------------------------------------------------------------------------
	* Merge new hires from policy
	use "$data/cb/raw_policy_to_nonpolicy_new_hires.dta", clear
	
	* Collapse across wage bins
	collapse (sum) new_hires = freq , by(cmp_company_code cmp_company_code_previous cz mdate)
	
	reshape wide new_hires,i(cmp_company_code cz mdate) j(cmp_company_code_previous) string
	egen all_new_hires=rowtotal(new_hires*)
	tempfile new_hires_from_policy

	save "$data/cb/raw_policy_to_nonpolicy_new_hires_collapsed.dta", replace

	* Merge in separations to policy
	insheet using "$data/cb/raw/[Project Name]_result1_7_czone_nonpol_pol_hire_date.csv", clear
	
	* Create monthly date and year variables 
	gen str7 str_monthyear = substr(string(wt_), 1, 4) + "-" + substr(string(wt_), 5, 2) + "-01"
	gen mdate = monthly(str_monthyear, "YM")
	format mdate %tm
	gen month=month(dofm(mdate))
	gen year=yofd(dofm(mdate))	
	
	rename cmp_company_code_previous cmp_company_code
	rename czone cz
	drop wt_original_hire_date_new str_monthyear month year
	rename freq separations_to_policy
	reshape wide separations_to_policy, i(cmp_company_code mdate cz) j(cmp_company_code_new) string
	egen all_separations=rowtotal(separations_to_policy*)
	tempfile separations_to_policy
	
	save "$data/cb/raw_policy_to_nonpolicy_separations_collapsed.dta", replace

* ------------------------------------------------------------------------------
*17. Policy firm first quarter wage bill share out of QCEW aggregate
* ------------------------------------------------------------------------------

	use "$data/qcew/qcew_cz_all_industries_final.dta", clear
	
	bysort czone qdate: gen order=_n
	keep if order==1
	drop order
	keep if qtr==1
	
	keep cz_total_qtrly_wages cz_avg_emp czone qdate qtr
	
	rename czone cz
	rename qtr quarter
	tempfile qcew_totals
	save `qcew_totals', replace
	
	foreach company in "11111" "66666" "55555" "44444" "22222" "33333" {
		
		* Load data
		if "`company'"=="22222"{
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part1.csv", clear 
			tempfile part1
			save `part1'
			
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_22222_part2.csv", clear 
			append using `part1'		
		}
		else {
			insheet using "$data/cb/raw/[Project Name]_result1_2_czone_`company'.csv", clear 
		}
	
		gen wage_bin=subinstr(wage_flag,"wage_","",.)
		replace wage_bin="30" if wage_bin=="gt30"
		replace wage_bin="7" if wage_bin=="lt8"
		replace wage_bin="" if wage_bin=="null"
		destring wage_bin, replace
		drop if wage_bin==.
		drop wage_flag
		
		* Create monthly date and year variables 
		gen str7 str_monthyear = substr(string(archive), 1, 4) + "-" + substr(string(archive), 5, 2) + "-01"
		gen mdate = monthly(str_monthyear, "YM")
		format mdate %tm
		gen month=month(dofm(mdate))
		gen year=yofd(dofm(mdate))
		la var year "Year"
		drop str_monthyear
		
		gen qdate = qofd(dofm(mdate))
		format qdate %tq
		gen quarter=quarter(dofm(mdate))
		
		keep if inlist(month, 3)
		
		quietly bysort czone wage_bin year: gen dup = cond(_N==1,0,_n)
			
		collapse (sum) total_gross_pay_ytd freq_gross_pay_ytd, by(cmp_company_code czone qdate quarter year)
			
		* Save
		tempfile clean_quarterly_`company'
		save `clean_quarterly_`company''
}
	* Append
	clear
	foreach company in "11111" "66666" "55555" "44444" "22222" "33333"{
		append using `clean_quarterly_`company''
	}	

	rename czone cz 
	merge m:1 cz qdate using `qcew_totals', keep (3) nogen
	
	gen wage_bill_share_q1=total_gross_pay_ytd/cz_total_qtrly_wages
	gen emp_share_q1=freq_gross_pay_ytd/cz_avg_emp	
	
	* Save
	save "$data/cb/clean_policy_firm_q1_wagebill_cz.dta", replace
	
* ------------------------------------------------------------------------------
*18. All gap measures
* ------------------------------------------------------------------------------
	use "$data/cb/clean_policy_firm_cz.dta", clear
	gen policy_firm=1
	append using "$data/cb/clean_nonpolicy_firm_cz.dta"
	replace policy_firm=0 if policy_firm==.
	tempfile fulldata
	save `fulldata'

	* Store tempfile of archive to mdate crosswalk
	insheet using "$data/cb/documentation/archive_mdate_crosswalk.csv", clear
	tempfile archive_mdate_crosswalk
	save `archive_mdate_crosswalk'	

	* Load event info
	use "$data/cb/events.dta", clear
	
	* Drop overlapping events
	drop if min(months_since_last_policy, months_until_next_policy) <= 6
	
	levelsof eventid, local(exp)
	* Build panel for each event 
	foreach experiment in `exp' {
		
		* Keep relevant row
		use in `experiment' using "$data/cb/events.dta", clear
		
		* Identify date
		merge 1:1 archive using `archive_mdate_crosswalk'
		
		* Store relevant parameters
		foreach vr in cmp_company_code mw share_affected archive mdate months_since_last_policy months_until_next_policy {
			local `vr'=`vr'[1]
		}
		local bmw=`mw'-1
		local amw=`mw'-1
		
		noi di in green "Experiment `experiment': " in yellow `"`cmp_company_code' (`mw') `mdate'"' 
		
	use `fulldata', clear
	* Create event time variable
	gen etime=mdate-`mdate'-1
	 
	local event_start=-12
	local event_end=11
	keep if inrange(etime, `event_start',`event_end')

	gen trt_exp= `experiment'
	gen mw=`mw'
	gen share_affected=`share_affected'
	gen months_since_last_policy=`months_since_last_policy'
	gen months_until_next_policy=`months_until_next_policy'

	* Generate outcomes equal to share of workers paid below, at and above minimum wage
	gen share_wage_ltmw=share_wage_lt`mw'
	gen share_wage_mw=wage_`mw'/tot_emp
	gen share_wage_gtmw=wage_gt`mw'/tot_emp
	
	preserve
	* Create gap measure for policy company only
	keep if cmp==`cmp_company_code'
	* NEW: Restrict to CZ cells with at least 30 *actual* emp at any time during sample period
	bysort cz: egen min_emp_cz = min(tot_emp)
	drop if min_emp_cz < 30
		
	* Specify start and end dates for gap measure
	local gap_measure_start=-6
	di `gap_measure_start'
	local gap_measure_end=`gap_measure_start'+3
	di `gap_measure_end'
	
	* For wage bins below the minimum wage, the gap is the number of emp per bin times the value of the bin
	forval i=7(1)`bmw'{ 
		local n = `mw'-`i'
		gen gap_`n'=(wage_`i')*`n'
	}

	* For wage bins at or above the minimum wage, the gap is set to 0
	forval i=`mw'(1)29{ 
		gen gap_`i'=0
	}

	* The denominator for each bin is the number of emp per bin times the value of the bin
	forval i=7(1)29{ 
		gen sumwage_`i'=wage_`i'*`i'
	}

	* Gap numerator is sum of gap per bin
	egen sum_gap=rowtotal(gap*)

	* Gap denominator is sum of denominator per bin
	egen sum_wage=rowtotal(sumwage*)

	* Final gap measure is numerator divided by denominator
	gen gap=sum_gap/sum_wage	
	drop sum*
	qui replace gap=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen avggap=mean(gap), by(cz) 

	mdesc avggap			
	drop if avggap==.

	rename avggap T
	sum T,d
	
	* Create placebo gap measure	
	* Specify start and end dates for gap measure
	local placebo_gap_start=-12
	di `placebo_gap_start'
	local placebo_gap_end=`placebo_gap_start'+3
	di `placebo_gap_end'

	* For wage bins below the minimum wage, the gap is the number of emp per bin times the value of the bin
	forval i=7(1)`bmw'{ 
		local n = `mw'-`i'
		gen placebo_gap_`n'=(wage_`i')*`n'
	}

	* For wage bins at or above the minimum wage, the gap is set to 0
	forval i=`mw'(1)29{ 
		gen placebo_gap_`i'=0
	}

	* The denominator for each bin is the number of emp per bin times the value of the bin
	forval i=7(1)29{ 
		gen sumwage_`i'=wage_`i'*`i'
	}

	* Gap numerator is sum of gap per bin
	egen sum_placebo_gap=rowtotal(placebo_gap*)

	* Gap denominator is sum of denominator per bin
	egen sum_wage=rowtotal(sumwage*)

	* Final gap measure is numerator divided by denominator
	gen placebo_gap=sum_placebo_gap/sum_wage			
	qui replace placebo_gap=. if !inrange(etime,`placebo_gap_start',`placebo_gap_end') 	
	qui egen avgplacebogap=mean(placebo_gap), by(cz) 	
	
	* Save T to merge with nonpolicy stacked dataset
	contract T avgplacebogap cz
	tempfile company_gap
	save `company_gap'
	restore 

	* Create super gap and wage bill share measure	
	* Specify start and end dates for gap and share measures
	local gap_measure_start=-6
	di `gap_measure_start'
	local gap_measure_end=`gap_measure_start'+3
	di `gap_measure_end'

	* For wage bins below the minimum wage, the gap is the number of workers per bin times the value of the bin, only at the policy company
	forval i=7(1)`bmw'{ 
		local n = `mw'-`i'
		gen gap_`n'=(wage_`i')*`n'
		replace gap_`n'=0 if cmp_company_code!=`cmp_company_code'
	}

	* For wage bins at or above the minimum wage, the gap is set to 0
	forval i=`mw'(1)29{ 
		gen gap_`i'=0
	}
	
	* For the wage bill share, the numerator is the total wage bill at the policy company
	forval i=7(1)29{ 
		gen wage_bill_share_`i'=(wage_`i')*`i'
		replace wage_bill_share_`i'=0 if cmp_company_code!=`cmp_company_code'
	}

	* The denominator is the total wage bill - sum of number of workers per bin times the value of the bin
	forval i=7(1)29{ 
		gen sumwage_`i'=wage_`i'*`i'
	}

	* For the employment share, the numerator is the total employment at the policy company
	forval i=7(1)29{ 
		gen employment_share_`i'=(wage_`i')
		replace employment_share_`i'=0 if cmp_company_code!=`cmp_company_code'
	}

	* The denominator is total employment - sum of number of workers per bin
	forval i=7(1)29{ 
		gen sumemp_`i'=wage_`i'
	}
	
	* Gap numerator is sum of gap per bin
	egen sum_gap=rowtotal(gap*)
	
	* Wage bill share numerator is sum of wage bill
	egen sum_wage_bill_share=rowtotal(wage_bill_share*)

	* Employment share numerator is sum of employment share
	egen sum_employment_share=rowtotal(employment_share*)
	
	* Denominator for gap and wage bill share is sum of wage bill per bin
	egen sum_wage=rowtotal(sumwage*)

	* Denominator for employment is sum of workers per bin
	egen sum_emp=rowtotal(sumemp*)
	
	* Collapse to CZ-month level
	collapse (sum) sum_gap sum_wage sum_emp sum_wage_bill_share sum_employment_share, by(cz mdate etime)

	* Final gap measure is numerator divided by denominator
	gen gap=sum_gap/sum_wage	
	drop sum_gap
	qui replace gap=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen avggap=mean(gap), by(cz) 
	
	* Final wage bill share measure is numerator divided by denominator
	gen wage_bill_share_monthly=sum_wage_bill_share/sum_wage	
	qui replace wage_bill_share_monthly=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen wage_bill_share=mean(wage_bill_share_monthly), by(cz) 
	
	* Bring in alternative denominators for employment share
	rename cz czone
	merge 1:1 czone mdate using "$data/qcew/qcew_emp_cz", keep(3) nogen keepusing(emp_main_ind emp_all)
	rename czone cz

	* Final employment share measure is numerator divided by denominator
	gen employment_share_monthly=sum_employment_share/sum_emp
	gen employment_share_monthly_q=sum_employment_share/emp_main_ind
	gen employment_share_monthly_q_all=sum_employment_share/emp_all
	drop sum*
	qui replace employment_share_monthly=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui replace employment_share_monthly_q=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui replace employment_share_monthly_q_all=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen employment_share=mean(employment_share_monthly), by(cz) 
	qui egen employment_share_q=mean(employment_share_monthly_q), by(cz) 
	qui egen employment_share_q_all=mean(employment_share_monthly_q_all), by(cz) 

	***
	mdesc wage_bill_share
	mdesc avggap			
	drop if avggap==.

	gen super_gap=avggap
	sum super_gap,d

	keep super_gap wage_bill_share employment_share employment_share_q employment_share_q_all mdate cz
	contract super_gap wage_bill_share employment_share employment_share_q employment_share_q_all cz
	merge 1:1 cz using `company_gap', nogen keep (3)
	
	xtile gap_decile = T, nq(10)
	xtile super_gap_decile = super_gap, nq(10)
	xtile wage_bill_share_decile = wage_bill_share, nq(10)
	xtile employment_share_decile = employment_share, nq(10)
	xtile employment_share_q_decile = employment_share_q, nq(10)
	xtile employment_share_q_all_decile = employment_share_q_all, nq(10)
	
	gen trt_exp=`experiment'

	save "$data/cb/all_gap_measures_`experiment'.dta", replace	
	}

* ------------------------------------------------------------------------------
*19. All gap measures dataset with policy firm first quarter wage bill info
* ------------------------------------------------------------------------------

	* Store tempfile of archive to mdate crosswalk
	insheet using "$data/cb/documentation/archive_mdate_crosswalk.csv", clear
	tempfile archive_mdate_crosswalk
	save `archive_mdate_crosswalk'	
	
	* Select events
	use "$data/cb/events.dta", clear
	
	* Drop overlapping events
	drop if min(months_since_last_policy, months_until_next_policy) <= 6	
	
	* Load experiment details
	levelsof eventid, local(exp)
	foreach experiment in `exp' {	
		
		* Keep relevant row
		use in `experiment' using "$data/cb/events.dta", clear
		
		* Identify date
		merge 1:1 archive using `archive_mdate_crosswalk'
		
		* Get quarter
		gen qdate = qofd(dofm(mdate))
		gen trtyear = yofd(dofm(mdate))
		gen trtquarter=quarter(dofm(mdate))

		
		* Store relevant parameters
		foreach vr in cmp_company_code mw share_affected archive mdate qdate trtyear trtquarter months_since_last_policy months_until_next_policy {
			local `vr'=`vr'[1]
		}
		local bmw=`mw'-1
		local amw=`mw'-1
		
		clear
		
		noi di in green "Experiment `experiment': " in yellow `"`cmp_company_code' (`mw') `mdate'"'
		
		* Load quartlery CZ wage and employment data
		use "$data/cb/clean_policy_firm_q1_wagebill_cz.dta", clear
		
		* Fill in experiment parameters
		gen trt_exp= `experiment'
		gen share_affected=`share_affected'
		gen months_since_last_policy=`months_since_last_policy'
		gen months_until_next_policy=`months_until_next_policy'
		gen etime=qdate-`qdate'-1
		gen mw=`mw'
		gen trtquarter=`trtquarter'
		keep if cmp_company_code==`cmp_company_code'

		if trtquarter==1{
			keep if etime==-5
		} 
		else{
			keep if etime<0 & year==`trtyear'
		}
		
		* Merge in avg gap calculations for the policy firm
		merge 1:1 cz using "$data/cb/all_gap_measures_`experiment'.dta", nogen keepusing(T wage_bill* employment_share* super* avgplacebogap) keep (3)
		
		save "$data/cb/all_gap_measures_`experiment'_qtrly_wage_bill.dta", replace
	}
	
* ------------------------------------------------------------------------------
*20. Super gap measure restricted to sample ind
* ------------------------------------------------------------------------------
	
use "$data/cb/clean_policy_firm_cz.dta", clear
	gen policy_firm=1
	append using "$data/cb/clean_nonpolicy_firm_cz.dta"
	keep if sample_ind==1 | policy_firm==1
	replace policy_firm=0 if policy_firm==.
	tempfile fulldata
	save `fulldata'

	* Store tempfile of archive to mdate crosswalk
	insheet using "$data/cb/documentation/archive_mdate_crosswalk.csv", clear
	tempfile archive_mdate_crosswalk
	save `archive_mdate_crosswalk'	

	* Load event info
	use "$data/cb/events.dta", clear
	
	* Drop overlapping events
	drop if min(months_since_last_policy, months_until_next_policy) <= 6
	
	levelsof eventid, local(exp)
	* Build panel for each event 
	foreach experiment in `exp' {
		
		* Keep relevant row
		use in `experiment' using "$data/cb/events.dta", clear
		
		* Identify date
		merge 1:1 archive using `archive_mdate_crosswalk'
		
		* Store relevant parameters
		foreach vr in cmp_company_code mw share_affected archive mdate months_since_last_policy months_until_next_policy {
			local `vr'=`vr'[1]
		}
		local bmw=`mw'-1
		local amw=`mw'-1
		
		noi di in green "Experiment `experiment': " in yellow `"`cmp_company_code' (`mw') `mdate'"' 
		
	use `fulldata', clear
	* Create event time variable
	gen etime=mdate-`mdate'-1
	 
	local event_start=-12
	local event_end=11
	keep if inrange(etime, `event_start',`event_end')

	gen trt_exp= `experiment'
	gen mw=`mw'
	gen share_affected=`share_affected'
	gen months_since_last_policy=`months_since_last_policy'
	gen months_until_next_policy=`months_until_next_policy'

	* Generate outcomes equal to share of workers paid below, at and above minimum wage
	gen share_wage_ltmw=share_wage_lt`mw'
	gen share_wage_mw=wage_`mw'/tot_emp
	gen share_wage_gtmw=wage_gt`mw'/tot_emp
	
	preserve
	* Create gap measure for policy company only
	keep if cmp==`cmp_company_code'
	* NEW: Restrict to CZ cells with at least 30 *actual* emp at any time during sample period
	bysort cz: egen min_emp_cz = min(tot_emp)
	drop if min_emp_cz < 30
		
	* Specify start and end dates for gap measure
	local gap_measure_start=-6
	di `gap_measure_start'
	local gap_measure_end=`gap_measure_start'+3
	di `gap_measure_end'
	
	* For wage bins below the minimum wage, the gap is the number of emp per bin times the value of the bin
	forval i=7(1)`bmw'{ 
		local n = `mw'-`i'
		gen gap_`n'=(wage_`i')*`n'
	}

	* For wage bins at or above the minimum wage, the gap is set to 0
	forval i=`mw'(1)29{ 
		gen gap_`i'=0
	}

	* The denominator for each bin is the number of emp per bin times the value of the bin
	forval i=7(1)29{ 
		gen sumwage_`i'=wage_`i'*`i'
	}

	* Gap numerator is sum of gap per bin
	egen sum_gap=rowtotal(gap*)

	* Gap denominator is sum of denominator per bin
	egen sum_wage=rowtotal(sumwage*)

	* Final gap measure is numerator divided by denominator
	gen gap=sum_gap/sum_wage	
	drop sum*
	qui replace gap=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen avggap=mean(gap), by(cz) 

	mdesc avggap			
	drop if avggap==.

	rename avggap T
	sum T,d
	
	* Create placebo gap measure	
	* Specify start and end dates for gap measure
	local placebo_gap_start=-12
	di `placebo_gap_start'
	local placebo_gap_end=`placebo_gap_start'+3
	di `placebo_gap_end'

	* For wage bins below the minimum wage, the gap is the number of emp per bin times the value of the bin
	forval i=7(1)`bmw'{ 
		local n = `mw'-`i'
		gen placebo_gap_`n'=(wage_`i')*`n'
	}

	* For wage bins at or above the minimum wage, the gap is set to 0
	forval i=`mw'(1)29{ 
		gen placebo_gap_`i'=0
	}

	* The denominator for each bin is the number of emp per bin times the value of the bin
	forval i=7(1)29{ 
		gen sumwage_`i'=wage_`i'*`i'
	}

	* Gap numerator is sum of gap per bin
	egen sum_placebo_gap=rowtotal(placebo_gap*)

	* Gap denominator is sum of denominator per bin
	egen sum_wage=rowtotal(sumwage*)

	* Final gap measure is numerator divided by denominator
	gen placebo_gap=sum_placebo_gap/sum_wage			
	qui replace placebo_gap=. if !inrange(etime,`placebo_gap_start',`placebo_gap_end') 	
	qui egen avgplacebogap=mean(placebo_gap), by(cz) 	
	
	* Save T to merge with nonpolicy stacked dataset
	contract T avgplacebogap cz
	tempfile company_gap
	save `company_gap'
	restore 

	* Create super gap and wage bill share measure	
	* Specify start and end dates for gap and share measures
	local gap_measure_start=-6
	di `gap_measure_start'
	local gap_measure_end=`gap_measure_start'+3
	di `gap_measure_end'

	* For wage bins below the minimum wage, the gap is the number of workers per bin times the value of the bin, only at the policy company
	forval i=7(1)`bmw'{ 
		local n = `mw'-`i'
		gen gap_`n'=(wage_`i')*`n'
		replace gap_`n'=0 if cmp_company_code!=`cmp_company_code'
	}

	* For wage bins at or above the minimum wage, the gap is set to 0
	forval i=`mw'(1)29{ 
		gen gap_`i'=0
	}
	
	* For the wage bill share, the numerator is the total wage bill at the policy company
	forval i=7(1)29{ 
		gen wage_bill_share_`i'=(wage_`i')*`i'
		replace wage_bill_share_`i'=0 if cmp_company_code!=`cmp_company_code'
	}

	* The denominator is the total wage bill - sum of number of workers per bin times the value of the bin
	forval i=7(1)29{ 
		gen sumwage_`i'=wage_`i'*`i'
	}

	* For the employment share, the numerator is the total employment at the policy company
	forval i=7(1)29{ 
		gen employment_share_`i'=(wage_`i')
		replace employment_share_`i'=0 if cmp_company_code!=`cmp_company_code'
	}

	* The denominator is total employment - sum of number of workers per bin
	forval i=7(1)29{ 
		gen sumemp_`i'=wage_`i'
	}
	
	* Gap numerator is sum of gap per bin
	egen sum_gap=rowtotal(gap*)
	
	* Wage bill share numerator is sum of wage bill
	egen sum_wage_bill_share=rowtotal(wage_bill_share*)

	* Employment share numerator is sum of employment share
	egen sum_employment_share=rowtotal(employment_share*)
	
	* Denominator for gap and wage bill share is sum of wage bill per bin
	egen sum_wage=rowtotal(sumwage*)

	* Denominator for employment is sum of workers per bin
	egen sum_emp=rowtotal(sumemp*)
	
	* Collapse to CZ-month level
	collapse (sum) sum_gap sum_wage sum_emp sum_wage_bill_share sum_employment_share, by(cz mdate etime)

	* Final gap measure is numerator divided by denominator
	gen gap=sum_gap/sum_wage	
	drop sum_gap
	qui replace gap=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen avggap=mean(gap), by(cz) 
	
	* Final wage bill share measure is numerator divided by denominator
	gen wage_bill_share_monthly=sum_wage_bill_share/sum_wage	
	qui replace wage_bill_share_monthly=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen wage_bill_share=mean(wage_bill_share_monthly), by(cz) 
	
	* Bring in alternative denominators for employment share
	rename cz czone
	merge 1:1 czone mdate using "$data/qcew/qcew_emp_cz", keep(3) nogen keepusing(emp_main_ind emp_all)
	rename czone cz

	* Final employment share measure is numerator divided by denominator
	gen employment_share_monthly=sum_employment_share/sum_emp
	gen employment_share_monthly_q=sum_employment_share/emp_main_ind
	gen employment_share_monthly_q_all=sum_employment_share/emp_all
	drop sum*
	qui replace employment_share_monthly=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui replace employment_share_monthly_q=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui replace employment_share_monthly_q_all=. if !inrange(etime,`gap_measure_start',`gap_measure_end') 	
	qui egen employment_share=mean(employment_share_monthly), by(cz) 
	qui egen employment_share_q=mean(employment_share_monthly_q), by(cz) 
	qui egen employment_share_q_all=mean(employment_share_monthly_q_all), by(cz) 

	***
	mdesc wage_bill_share
	mdesc avggap			
	drop if avggap==.

	gen super_gap_ind=avggap
	sum super_gap_ind,d

	keep super_gap_ind mdate cz
	contract super_gap_ind cz
	merge 1:1 cz using `company_gap', nogen keep (3)
	
	xtile super_gap_ind_decile = super_gap_ind, nq(10)
	
	gen trt_exp=`experiment'

	save "$data/cb/super_gap_ind_measures_`experiment'.dta", replace	
	}		
